1ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian/* 2ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * Copyright 2013 The LibYuv Project Authors. All rights reserved. 3ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * 4ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * Use of this source code is governed by a BSD-style license 5ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * that can be found in the LICENSE file in the root of the source 6ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * tree. An additional intellectual property rights grant can be found 7ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * in the file PATENTS. All contributing project authors may 8ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian * be found in the AUTHORS file in the root of the source tree. 9ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian */ 10ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 11ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "libyuv/row.h" 127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "libyuv/scale_row.h" 13ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 14ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus 15ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramaniannamespace libyuv { 16ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianextern "C" { 17ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif 18ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 19ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// This module is for Visual C x86. 
207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ 217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian defined(_MSC_VER) && !defined(__clang__) 22ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 23ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 0 to 9 24ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf0 = 25ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; 26ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 27ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. 28ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf1 = 29ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; 30ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 31ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
32ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf2 = 33ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; 34ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 35ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 0 to 10 36ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf01 = 37ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; 38ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 39ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. 40ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf11 = 41ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; 42ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 43ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
44ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf21 = 45ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; 46ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 47ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Coefficients for source bytes 0 to 10 48ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kMadd01 = 49ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; 50ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 51ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Coefficients for source bytes 10 to 21 52ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kMadd11 = 53ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; 54ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 55ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Coefficients for source bytes 21 to 31 56ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kMadd21 = 57ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; 58ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 59ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Coefficients for source bytes 21 to 31 60ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic vec16 kRound34 = 61ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 2, 2, 2, 2, 2, 2, 2, 2 }; 62ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 63ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf38a = 64ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 0, 3, 
6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 65ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 66ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf38b = 67ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; 68ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 69ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange words 0,3,6 into 0,1,2 70ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAc = 71ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 72ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 73ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange words 0,3,6 into 3,4,5 74ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAc3 = 75ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; 76ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 77ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scaling values for boxes of 3x3 and 2x3 78ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec16 kScaleAc33 = 79ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; 80ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 81ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange first value for pixels 0,1,2,3,4,5 82ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAb0 = 83ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 
128, 128, 128, 128 }; 84ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 85ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange second value for pixels 0,1,2,3,4,5 86ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAb1 = 87ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; 88ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 89ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange third value for pixels 0,1,2,3,4,5 90ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAb2 = 91ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; 92ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 93ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scaling values for boxes of 3x2 and 2x2 94ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec16 kScaleAb2 = 95ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; 96ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 97ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 32 pixels, throws half away and writes 16 pixels. 
987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 99ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride ignored 104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_ptr 105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 1087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 1097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 // isolate odd pixels. 
112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 1147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 1167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 32x1 rectangle to 16x1. 1247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride 130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_ptr 131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 8 
134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 1367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 1377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm1 143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm5 145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm3, xmm5 146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgw xmm0, xmm2 147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgw xmm1, xmm3 148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 1527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian} 158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 32x2 rectangle to 16x1. 1607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_ptr 166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_stride 167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_ptr 168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // dst_width 169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm5, 8 171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 1737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 1747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 1757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + esi] 1767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + esi + 16] 177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb 
xmm0, xmm2 // average rows 179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm3 180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, xmm1 184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm5 186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm3, xmm5 187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgw xmm0, xmm2 188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgw xmm1, xmm3 189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 1937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 16 194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_SCALEROWDOWN2_AVX2 2027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian// Reads 64 pixels, throws half away and writes 32 pixels. 2037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 2057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride ignored 209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_ptr 210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 2137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 2147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 2157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 64] 2167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm0, ymm0, 8 // isolate odd pixels. 
2177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm1, ymm1, 8 2187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 2197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 2207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 2217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 32] 2227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 2237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg wloop 2247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 2257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Blends 64x1 rectangle to 32x1. 
2317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 2337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 2367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // src_stride 2377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_ptr 2387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b 2417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm4, ymm4, 15 2427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm4, ymm4, ymm4 2437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpxor ymm5, ymm5, ymm5 // constant 0 244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian wloop: 2467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] 2477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 2487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 64] 2497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 2507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmaddubsw ymm0, ymm0, ymm4 // average horizontally 2517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmaddubsw ymm1, ymm1, ymm4 
2527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 2537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgw ymm1, ymm1, ymm5 2547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 2557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 2567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 2577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 2587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 32] 2597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 2607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg wloop 2617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 2627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Blends 64x2 rectangle to 32x1. 
2687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 2697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, 2707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 2727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian push esi 2737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_ptr 2747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_stride 2757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_ptr 2767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // dst_width 277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b 2797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpsrlw ymm4, ymm4, 15 2807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm4, ymm4, ymm4 2817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpxor ymm5, ymm5, ymm5 // constant 0 282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 2837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian wloop: 2847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm0, [eax] // average rows 2857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm1, [eax + 32] 2867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm0, ymm0, [eax + esi] 2877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgb ymm1, ymm1, [eax + esi + 32] 2887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh 
Venkatasubramanian lea eax, [eax + 64] 2897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 2907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmaddubsw ymm0, ymm0, ymm4 // average horizontally 2917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpmaddubsw ymm1, ymm1, ymm4 2927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 2937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpavgw ymm1, ymm1, ymm5 2947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpackuswb ymm0, ymm0, ymm1 2957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb 2967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 2977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 2987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 32] 2997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 3007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg wloop 3017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 3027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pop esi 3037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 3077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_SCALEROWDOWN2_AVX2 308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Point samples 32 pixels to 8 pixels. 
3107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride ignored 316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_ptr 317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrld xmm5, 24 320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pslld xmm5, 16 321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 3237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 3247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm0, xmm5 327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm1, xmm5 328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian movq qword ptr [edx], xmm0 332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 3337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 32x4 rectangle to 8x1. 3417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_ptr 348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_stride 349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 12] // dst_ptr 350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 16] // dst_width 351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [esi + esi * 2] // src_stride * 3 352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff 
353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm7, 8 354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 3567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // average rows 3577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 3587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + esi] 3597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + esi + 16] 3607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm0, xmm2 361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm3 3627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + esi * 2] 3637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + esi * 2 + 16] 3647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm4, [eax + edi] 3657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm5, [eax + edi + 16] 366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm2, xmm4 368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm3, xmm5 369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm2 370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm3 371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // average columns (32 to 16 pixels) 373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, 
xmm1 375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 8 376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm7 377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm3, xmm7 378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgw xmm0, xmm2 379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgw xmm1, xmm3 380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm1 381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // average columns (16 to 8 pixels) 383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 8 384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pand xmm2, xmm7 385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgw xmm0, xmm2 386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 8] 3907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 
#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
// Each loop iteration reads 64 source bytes and stores 16 destination bytes,
// keeping byte 2 of every group of 4 source bytes. src_stride is never read.
// The loop body always runs at least once and always stores 16 bytes.
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    vpcmpeqb   ymm5, ymm5, ymm5      // generate mask 0x00ff0000
    vpsrld     ymm5, ymm5, 24
    vpslld     ymm5, ymm5, 16

  wloop:
    vmovdqu    ymm0, [eax]
    vmovdqu    ymm1, [eax + 32]
    lea        eax,  [eax + 64]
    vpand      ymm0, ymm0, ymm5      // keep byte 2 of each dword
    vpand      ymm1, ymm1, ymm5
    vpackuswb  ymm0, ymm0, ymm1      // packs within each 128 bit lane
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vpsrlw     ymm0, ymm0, 8
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu    [edx], xmm0           // store low 16 bytes
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
// Four source rows are combined with vpavgb, then horizontally adjacent
// pixels are combined twice with vpavgw. Every step is a pairwise rounded
// average. Each loop iteration stores 16 destination bytes; the loop body
// always runs at least once.
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb   ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
    vpsrlw     ymm7, ymm7, 8

  wloop:
    vmovdqu    ymm0, [eax]           // average rows
    vmovdqu    ymm1, [eax + 32]
    vpavgb     ymm0, ymm0, [eax + esi]
    vpavgb     ymm1, ymm1, [eax + esi + 32]
    vmovdqu    ymm2, [eax + esi * 2]
    vmovdqu    ymm3, [eax + esi * 2 + 32]
    vpavgb     ymm2, ymm2, [eax + edi]
    vpavgb     ymm3, ymm3, [eax + edi + 32]
    lea        eax, [eax + 64]
    vpavgb     ymm0, ymm0, ymm2      // combine the two row pairs
    vpavgb     ymm1, ymm1, ymm3

    vpand      ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
    vpand      ymm3, ymm1, ymm7
    vpsrlw     ymm0, ymm0, 8
    vpsrlw     ymm1, ymm1, 8
    vpavgw     ymm0, ymm0, ymm2
    vpavgw     ymm1, ymm1, ymm3
    vpackuswb  ymm0, ymm0, ymm1
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vpand      ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
    vpsrlw     ymm0, ymm0, 8
    vpavgw     ymm0, ymm0, ymm2
    vpackuswb  ymm0, ymm0, ymm0
    vpermq     ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu    [edx], xmm0           // store low 16 bytes
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 4947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride ignored 500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_ptr 501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, kShuf0 503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShuf1 504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kShuf2 505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 5077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 5087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm1 511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian palignr xmm1, xmm0, 8 512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm3 513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm4 514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb 
xmm2, xmm5 515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + 8], xmm1 517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + 16], xmm2 518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 24] 519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 24 520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 32x2 rectangle to 24x1 527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Produces three 8 byte values. For each 8 bytes, 16 bytes are read. 528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Then shuffled to do the scaling. 

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
5417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, 543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, 544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_ptr 548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_stride 549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_ptr 550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // dst_width 551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, kShuf01 552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, kShuf11 553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShuf21 554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kMadd01 555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, kMadd11 556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm7, kRound34 557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 5597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // pixels 0..7 5607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + esi] 561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 
562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm2 563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm5 564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddsw xmm0, xmm7 565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 2 566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax + 8] // pixels 8..15 569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + esi + 8] 570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm3 572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm6 573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddsw xmm0, xmm7 574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 2 575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + 8], xmm0 5777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax + 16] // pixels 16..23 5787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + esi + 16] 579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, kMadd21 583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian pmaddubsw xmm0, xmm1 584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddsw xmm0, xmm7 585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 2 586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + 16], xmm0 588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 24] 5897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 24 590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Note that movdqa+palign may be better than movdqu. 
5987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, 600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, 601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_ptr 605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_stride 606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_ptr 607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // dst_width 608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, kShuf01 609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, kShuf11 610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShuf21 611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kMadd01 612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, kMadd11 613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm7, kRound34 614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 6167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // pixels 0..7 6177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + esi] 618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm0 
619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm2 621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm5 622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddsw xmm0, xmm7 623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 2 624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm0, [eax + 8] // pixels 8..15 627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu xmm1, [eax + esi + 8] 628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm0 629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm1 630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm3 631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm6 632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddsw xmm0, xmm7 633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 2 634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + 8], xmm0 6367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax + 16] // pixels 16..23 6377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + esi + 16] 638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm0 640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian pavgb xmm0, xmm1 641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, kMadd21 643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm1 644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddsw xmm0, xmm7 645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 2 646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx + 16], xmm0 648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx+24] 6497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 24 650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 3/8 point sampler 658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scale 32 pixels to 12 6607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian __asm { 664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride ignored 666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_ptr 667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShuf38a 669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kShuf38b 670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop: 6727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 6737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm5 677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusb xmm0, xmm1 678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edx], xmm0 // write 12 pixels 680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhlps xmm1, xmm0 681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edx + 8], xmm1 682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 12] 6837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 12 684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop 
685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scale 16x3 pixels to 6x1 with interpolation 6917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, 693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, 694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_ptr 698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_stride 699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_ptr 700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // dst_width 701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, kShufAc 702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, kShufAc3 703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kScaleAc33 704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm5, xmm5 705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop: 
7077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 7087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm6, [eax + esi] 709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhlps xmm1, xmm0 710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhlps xmm7, xmm6 711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm5 712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm1, xmm5 713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm6, xmm5 714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm7, xmm5 715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm0, xmm6 716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm1, xmm7 7177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm6, [eax + esi * 2] 718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhlps xmm7, xmm6 720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm6, xmm5 721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm7, xmm5 722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm0, xmm6 723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm1, xmm7 724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrldq xmm0, 2 727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm6, xmm0 728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian psrldq xmm0, 2 729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm6, xmm0 730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm6, xmm2 731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrldq xmm1, 2 734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm7, xmm1 735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrldq xmm1, 2 736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm7, xmm1 737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm7, xmm3 738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm6, xmm7 739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm6, xmm6 742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edx], xmm6 // write 6 pixels 744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlq xmm6, 16 745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edx + 2], xmm6 746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 6] 7477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 6 748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop 749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian ret 752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scale 16x2 pixels to 6x1 with interpolation 7567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, 758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, 759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_ptr, int dst_width) { 760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_ptr 763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_stride 764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_ptr 765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // dst_width 766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, kShufAb0 767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm3, kShufAb1 768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShufAb2 769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kScaleAb2 770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop: 7727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] // average 2 rows into xmm0 
7737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + esi] 774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 7757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pavgb xmm0, xmm1 776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm2 779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm6, xmm0 780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm6, xmm3 781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm1, xmm6 782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm1, xmm0 784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm1, xmm1 787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edx], xmm1 // write 6 pixels 789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlq xmm1, 16 790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edx + 2], xmm1 791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 6] 7927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 6 793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop 794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian pop esi 796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 8007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Reads 16 bytes and accumulates to 16 shorts at a time. 8017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 8027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { 803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 8047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 8057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_ptr 8067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // src_width 8077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian pxor xmm5, xmm5 8087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 8097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // sum rows 810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop: 8117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax] // read 16 bytes 8127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 16] 8137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [edx] // read 16 words from destination 8147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [edx + 16] 8157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqa xmm2, xmm3 8167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian punpcklbw xmm2, xmm5 
8177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian punpckhbw xmm3, xmm5 818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm0, xmm2 // sum 16 words 819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddusw xmm1, xmm3 8207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 // write 16 words to destination 8217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 8227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 32] 823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 16 824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg xloop 8257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian ret 8267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian } 8277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian} 828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 8297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_SCALEADDROW_AVX2 8307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Reads 32 bytes and accumulates to 32 shorts at a time. 
8317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 8327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { 8337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian __asm { 8347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov eax, [esp + 4] // src_ptr 8357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov edx, [esp + 8] // dst_ptr 8367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian mov ecx, [esp + 12] // src_width 8377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpxor ymm5, ymm5, ymm5 8387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 8397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian // sum rows 8407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian xloop: 8417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu ymm3, [eax] // read 32 bytes 8427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea eax, [eax + 32] 8437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck 8447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpcklbw ymm2, ymm3, ymm5 8457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpunpckhbw ymm3, ymm3, ymm5 8467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpaddusw ymm0, ymm2, [edx] // sum 16 words 8477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vpaddusw ymm1, ymm3, [edx + 32] 8487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx], ymm0 // write 32 words to destination 8497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vmovdqu [edx + 32], ymm1 8507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian lea edx, [edx + 64] 
8517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 8527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian jg xloop 8537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian 8547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian vzeroupper 855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 8587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // HAS_SCALEADDROW_AVX2 859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear column filtering. SSSE3 version. 8617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int dst_width, int x, int dx) { 864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push ebx 866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 12 + 4] // dst_ptr 869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 12 + 8] // src_ptr 870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12 + 12] // dst_width 871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, [esp + 12 + 16] // x 872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, [esp + 12 + 20] // dx 
873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, 0x04040000 // shuffle to line up fractions with pixel. 874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm5, eax 875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. 876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm6, 9 877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw eax, xmm2, 1 // get x0 integer. preroll 878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 2 879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl xloop29 880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm0, xmm2 // x1 = x0 + dx 882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, xmm3 883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm2, xmm0 // x0 x1 884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm3, xmm3 // dx dx 885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm3, xmm3 // dx * 2, dx * 2 886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw edx, xmm2, 3 // get x1 integer. preroll 887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 2 Pixel loop. 889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop2: 890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm2 // x0, x1 fractions. 
891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm2, xmm3 // x += dx 892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, ebx 894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 9 // 7 bit fractions. 895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, word ptr [esi + edx] // 2 source x1 pixels 896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm4, ebx 897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm5 // 0011 898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklwd xmm0, xmm4 899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm1, xmm6 // 0..7f and 7f..0 900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm1 // 16 bit, 2 pixels. 901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw eax, xmm2, 1 // get x0 integer. next iteration. 902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw edx, xmm2, 3 // get x1 integer. next iteration. 903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. 904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 // 8 bits, 2 pixels. 
905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd ebx, xmm0 906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov [edi], bx 907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 2] 908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 2 // 2 pixels 909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge xloop2 910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop29: 912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add ecx, 2 - 1 914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl xloop99 915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 pixel remainder 917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movzx ebx, word ptr [esi + eax] // 2 source x0 pixels 918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, ebx 919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 9 // 7 bit fractions. 920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm2, xmm5 // 0011 921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm2, xmm6 // 0..7f and 7f..0 922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm2 // 16 bit 923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 // 8.7 fixed point to low 8 bits. 
924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 // 8 bits 925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd ebx, xmm0 926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov [edi], bl 927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99: 929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop ebx 933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 16 pixels, duplicates them and writes 32 pixels. 
9387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, 940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int dst_width, int x, int dx) { 941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4] // dst_ptr 943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8] // src_ptr 944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // dst_width 945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 9477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm0 950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklbw xmm0, xmm0 951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhbw xmm1, xmm1 9527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 9537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 9557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 32 956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian} 961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) 9637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDown2_SSE2(const uint8* src_argb, 965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, 966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int dst_width) { 967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride ignored 970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_argb 971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 9747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 9757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0xdd 9787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 9807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 
982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 8x1 rectangle to 4x1. 9887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, 990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, 991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int dst_width) { 992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // src_argb 994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride ignored 995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12] // dst_argb 996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 16] // dst_width 997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 9997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 10007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 1001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 1002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 1003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0x88 // even pixels 
1004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm1, 0xdd // odd pixels 1005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm2 10067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 1007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 10087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 1009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 1010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 8x2 rectangle to 4x1. 
10167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, 1018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, 1019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int dst_width) { 1020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 1022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4 + 4] // src_argb 1023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 4 + 8] // src_stride 1024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4 + 12] // dst_argb 1025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 4 + 16] // dst_width 1026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 10287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 10297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm1, [eax + 16] 10307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm2, [eax + esi] 10317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm3, [eax + esi + 16] 1032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 32] 1033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm2 // average rows 1034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm3 1035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // average columns (8 to 4 pixels) 1036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian shufps xmm0, xmm1, 0x88 // even pixels 1037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm1, 0xdd // odd pixels 1038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm2 10397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 1040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 10417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 1042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 1043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 4 pixels at a time. 
10507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, 1052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int src_stepx, 1053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int dst_width) { 1054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push ebx 1056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8 + 4] // src_argb 1058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // src_stride ignored 1059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ebx, [esp + 8 + 12] // src_stepx 1060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 8 + 16] // dst_argb 1061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 20] // dst_width 1062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea ebx, [ebx * 4] 1063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [ebx + ebx * 2] 1064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 1066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, [eax] 1067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm1, [eax + ebx] 1068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm0, xmm1 1069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, [eax + ebx * 2] 1070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, [eax + edi] 
1071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + ebx * 4] 1072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm2, xmm3 1073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklqdq xmm0, xmm2 10747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 1075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 10767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 1077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 1078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop ebx 1081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends four 2x2 to 4x1. 
10867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, 1088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ptrdiff_t src_stride, 1089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int src_stepx, 1090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian uint8* dst_argb, int dst_width) { 1091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push ebx 1093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 1094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 12 + 4] // src_argb 1096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 12 + 8] // src_stride 1097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ebx, [esp + 12 + 12] // src_stepx 1098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 12 + 16] // dst_argb 1099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12 + 20] // dst_width 1100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [eax + esi] // row1 pointer 1101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea ebx, [ebx * 4] 1102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [ebx + ebx * 2] 1103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 1105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm0, qword ptr [eax] // row0 4 pairs 1106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps 
xmm0, qword ptr [eax + ebx] 1107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm1, qword ptr [eax + ebx * 2] 1108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps xmm1, qword ptr [eax + edi] 1109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + ebx * 4] 1110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm2, qword ptr [esi] // row1 4 pairs 1111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps xmm2, qword ptr [esi + ebx] 1112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm3, qword ptr [esi + ebx * 2] 1113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps xmm3, qword ptr [esi + edi] 1114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea esi, [esi + ebx * 4] 1115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm2 // average rows 1116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm1, xmm3 1117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm2, xmm0 // average columns (8 to 4 pixels) 1118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm0, xmm1, 0x88 // even pixels 1119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shufps xmm2, xmm1, 0xdd // odd pixels 1120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pavgb xmm0, xmm2 11217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 1122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 16] 11237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 1124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 1125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 
1127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop ebx 1129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Column scaling unfiltered. SSE2 version. 11347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, 1136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int dst_width, int x, int dx) { 1137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 1140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 4] // dst_argb 1141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_argb 1142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 12] // dst_width 1143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, [esp + 8 + 16] // x 1144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, [esp + 8 + 20] // dx 1145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 1147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 1148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian paddd xmm2, xmm0 1149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm3, xmm3 // 0, 0, 0, dx * 2 1150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 1151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm2, xmm0 // x3 x2 x1 x0 1152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm3, xmm3 // 0, 0, 0, dx * 4 1153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 1154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw eax, xmm2, 1 // get x0 integer. 1156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw edx, xmm2, 3 // get x1 integer. 1157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cmp ecx, 0 1159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jle xloop99 1160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 4 1161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl xloop49 1162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 4 Pixel loop. 1164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop4: 1165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, [esi + eax * 4] // 1 source x0 pixels 1166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm1, [esi + edx * 4] // 1 source x1 pixels 1167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw eax, xmm2, 5 // get x2 integer. 1168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw edx, xmm2, 7 // get x3 integer. 
1169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm2, xmm3 // x += dx 1170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm0, xmm1 // x0 x1 1171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm1, [esi + eax * 4] // 1 source x2 pixels 1173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm4, [esi + edx * 4] // 1 source x3 pixels 1174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw eax, xmm2, 1 // get x0 integer. next iteration. 1175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw edx, xmm2, 3 // get x1 integer. next iteration. 1176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm1, xmm4 // x2 x3 1177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 1178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqu [edi], xmm0 1179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 16] 11807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 4 // 4 pixels 1181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge xloop4 1182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop49: 1184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian test ecx, 2 1185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop29 1186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 2 Pixels. 
1188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, [esi + eax * 4] // 1 source x0 pixels 1189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm1, [esi + edx * 4] // 1 source x1 pixels 1190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw eax, xmm2, 5 // get x2 integer. 1191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm0, xmm1 // x0 x1 1192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edi], xmm0 1194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 8] 1195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop29: 1197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian test ecx, 1 1198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian je xloop99 1199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 Pixels. 
1201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm0, [esi + eax * 4] // 1 source x2 pixels 1202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd dword ptr [edi], xmm0 1203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99: 1204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version. 1212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Port to Neon 1213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for arranging 2 pixels into pairs for pmaddubsw 1215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuffleColARGB = { 1216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel 1217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel 1218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 1219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for duplicating 2 fractions into 8 bytes each 1221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuffleFractions = { 
1222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 1223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}; 1224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 12257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, 1227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int dst_width, int x, int dx) { 1228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push esi 1230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian push edi 1231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edi, [esp + 8 + 4] // dst_argb 1232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov esi, [esp + 8 + 8] // src_argb 1233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8 + 12] // dst_width 1234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm2, [esp + 8 + 16] // x 1235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd xmm3, [esp + 8 + 20] // dx 1236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm4, kShuffleColARGB 1237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm5, kShuffleFractions 1238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. 1239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm6, 9 1240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw eax, xmm2, 1 // get x0 integer. 
preroll 1241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 2 1242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl xloop29 1243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm0, xmm2 // x1 = x0 + dx 1245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm0, xmm3 1246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm2, xmm0 // x0 x1 1247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm3, xmm3 // dx dx 1248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm3, xmm3 // dx * 2, dx * 2 1249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw edx, xmm2, 3 // get x1 integer. preroll 1250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 2 Pixel loop. 1252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop2: 1253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movdqa xmm1, xmm2 // x0, x1 fractions. 1254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian paddd xmm2, xmm3 // x += dx 1255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels 1256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm1, 9 // 7 bit fractions. 
1257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels 1258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm1, xmm5 // 0000000011111111 1259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 // arrange pixels into pairs 1260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm1, xmm6 // 0..7f and 7f..0 1261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. 1262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw eax, xmm2, 1 // get x0 integer. next iteration. 1263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pextrw edx, xmm2, 3 // get x1 integer. next iteration. 1264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. 1265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. 
1266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq qword ptr [edi], xmm0 1267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edi, [edi + 8] 1268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 2 // 2 pixels 1269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jge xloop2 1270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop29: 1272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian add ecx, 2 - 1 1274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jl xloop99 1275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian // 1 pixel remainder 1277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm2, 9 // 7 bit fractions. 1278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels 1279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm2, xmm5 // 00000000 1280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pshufb xmm0, xmm4 // arrange pixels into pairs 1281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pxor xmm2, xmm6 // 0..7f and 7f..0 1282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. 1283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian psrlw xmm0, 7 1284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. 
1285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian movd [edi], xmm0 1286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99: 1288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop edi 1290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian pop esi 1291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 4 pixels, duplicates them and writes 8 pixels. 12967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, 1298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian int dst_width, int x, int dx) { 1299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov edx, [esp + 4] // dst_argb 1301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 8] // src_argb 1302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 12] // dst_width 1303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian wloop: 13057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu xmm0, [eax] 1306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea eax, [eax + 16] 1307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh 
Venkatasubramanian movdqa xmm1, xmm0 1308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckldq xmm0, xmm0 1309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian punpckhdq xmm1, xmm1 13107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx], xmm0 13117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian movdqu [edx + 16], xmm1 1312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian lea edx, [edx + 32] 13137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian sub ecx, 8 1314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian jg wloop 1315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Divide num by div and return as 16.16 fixed point result. 
13217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianint FixedDiv_X86(int num, int div) { 1323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // num 1325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cdq // extend num to 64 bits 1326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shld edx, eax, 16 // 32.16 1327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shl eax, 16 1328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian idiv dword ptr [esp + 8] 1329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 1332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Divide num by div and return as 16.16 fixed point result. 
13347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked) 1335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianint FixedDiv1_X86(int num, int div) { 1336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian __asm { 1337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov eax, [esp + 4] // num 1338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian mov ecx, [esp + 8] // denom 1339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian cdq // extend num to 64 bits 1340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shld edx, eax, 16 // 32.16 1341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian shl eax, 16 1342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub eax, 0x00010001 1343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sbb edx, 0 1344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian sub ecx, 1 1345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian idiv ecx 1346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian ret 1347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian } 1348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} 13497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) 1350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian 1351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus 1352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} // extern "C" 1353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian} // namespace libyuv 1354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif 1355