1ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian/*
2ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *
4ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  Use of this source code is governed by a BSD-style license
5ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  that can be found in the LICENSE file in the root of the source
6ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  tree. An additional intellectual property rights grant can be found
7ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  in the file PATENTS. All contributing project authors may
8ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian *  be found in the AUTHORS file in the root of the source tree.
9ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian */
10ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
11ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#include "libyuv/row.h"
127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "libyuv/scale_row.h"
13ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
14ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus
15ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramaniannamespace libyuv {
16ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianextern "C" {
17ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
18ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
19ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// This module is for Visual C x86.
207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    defined(_MSC_VER) && !defined(__clang__)
22ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
23ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 0 to 9
24ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf0 =
25ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
26ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
27ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
28ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf1 =
29ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
30ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
31ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
32ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf2 =
33ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
34ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
35ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 0 to 10
36ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf01 =
37ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
38ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
39ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
40ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf11 =
41ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
42ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
43ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
44ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf21 =
45ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
46ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
47ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Coefficients for source bytes 0 to 10
48ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kMadd01 =
49ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
50ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
51ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Coefficients for source bytes 10 to 21
52ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kMadd11 =
53ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
54ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
55ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Coefficients for source bytes 21 to 31
56ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kMadd21 =
57ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
58ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
59ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Coefficients for source bytes 21 to 31
60ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic vec16 kRound34 =
61ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 2, 2, 2, 2, 2, 2, 2, 2 };
62ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
63ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf38a =
64ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
65ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
66ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuf38b =
67ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
68ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
69ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange words 0,3,6 into 0,1,2
70ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAc =
71ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
72ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
73ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange words 0,3,6 into 3,4,5
74ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAc3 =
75ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
76ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
77ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scaling values for boxes of 3x3 and 2x3
78ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec16 kScaleAc33 =
79ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
80ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
81ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange first value for pixels 0,1,2,3,4,5
82ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAb0 =
83ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
84ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
85ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange second value for pixels 0,1,2,3,4,5
86ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAb1 =
87ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
88ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
89ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Arrange third value for pixels 0,1,2,3,4,5
90ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShufAb2 =
91ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
92ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
93ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scaling values for boxes of 3x2 and 2x2
94ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec16 kScaleAb2 =
95ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
96ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
97ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 32 pixels, throws half away and writes 16 pixels.
987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
99ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_ptr, int dst_width) {
101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_ptr
103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride ignored
104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12]       // dst_ptr
105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]       // dst_width
106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
1087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
1097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8               // isolate odd pixels.
112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8
113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
1147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
1167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
121ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
123ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 32x1 rectangle to 16x1.
1247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                              uint8* dst_ptr, int dst_width) {
127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_ptr
129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride
130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12]       // dst_ptr
131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]       // dst_width
132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 8
134ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
1367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
1377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8
142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm1
143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8
144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm5
145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm3, xmm5
146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgw      xmm0, xmm2
147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgw      xmm1, xmm3
148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
1527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 32x2 rectangle to 16x1.
1607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           uint8* dst_ptr, int dst_width) {
163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_ptr
166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]    // src_stride
167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]   // dst_ptr
168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // dst_width
169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm5, 8
171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
1737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
1747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
1757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + esi]
1767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + esi + 16]
177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2            // average rows
179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm3
180ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8
183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm1
184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8
185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm5
186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm3, xmm5
187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgw      xmm0, xmm2
188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgw      xmm1, xmm3
189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
1937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 16
194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_SCALEROWDOWN2_AVX2
2027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Reads 64 pixels, throws half away and writes 32 pixels.
2037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
2057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_ptr, int dst_width) {
206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_ptr
208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride ignored
209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12]       // dst_ptr
210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]       // dst_width
211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
2137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm0, [eax]
2147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm1, [eax + 32]
2157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         eax,  [eax + 64]
2167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
2177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm1, ymm1, 8
2187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm0, ymm0, ymm1
2197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
2207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx], ymm0
2217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         edx, [edx + 32]
2227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         ecx, 32
2237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg          wloop
2247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Blends 64x1 rectangle to 32x1.
2317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
2337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                              uint8* dst_ptr, int dst_width) {
234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         eax, [esp + 4]        // src_ptr
2367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                      // src_stride
2377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         edx, [esp + 12]       // dst_ptr
2387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         ecx, [esp + 16]       // dst_width
239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
2417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm4, ymm4, 15
2427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm4, ymm4, ymm4
2437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpxor       ymm5, ymm5, ymm5      // constant 0
244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  wloop:
2467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm0, [eax]
2477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm1, [eax + 32]
2487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         eax,  [eax + 64]
2497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
2517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmaddubsw  ymm1, ymm1, ymm4
2527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
2537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgw      ymm1, ymm1, ymm5
2547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm0, ymm0, ymm1
2557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
2567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx], ymm0
2587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         edx, [edx + 32]
2597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         ecx, 32
2607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg          wloop
2617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Blends 64x2 rectangle to 32x1.
2687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
2697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
2707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                           uint8* dst_ptr, int dst_width) {
271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
2727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push        esi
2737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         eax, [esp + 4 + 4]    // src_ptr
2747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         esi, [esp + 4 + 8]    // src_stride
2757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         edx, [esp + 4 + 12]   // dst_ptr
2767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         ecx, [esp + 4 + 16]   // dst_width
277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
2797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm4, ymm4, 15
2807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm4, ymm4, ymm4
2817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpxor       ymm5, ymm5, ymm5      // constant 0
282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
2837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  wloop:
2847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm0, [eax]           // average rows
2857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm1, [eax + 32]
2867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb      ymm0, ymm0, [eax + esi]
2877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb      ymm1, ymm1, [eax + esi + 32]
2887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         eax,  [eax + 64]
2897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
2917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpmaddubsw  ymm1, ymm1, ymm4
2927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
2937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgw      ymm1, ymm1, ymm5
2947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm0, ymm0, ymm1
2957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
2967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx], ymm0
2987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         edx, [edx + 32]
2997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         ecx, 32
3007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg          wloop
3017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop         esi
3037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
305ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
3077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_SCALEROWDOWN2_AVX2
308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Point samples 32 pixels to 8 pixels.
3107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
311ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        uint8* dst_ptr, int dst_width) {
313ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_ptr
315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride ignored
316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12]       // dst_ptr
317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]       // dst_width
318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrld      xmm5, 24
320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pslld      xmm5, 16
321ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
3237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
3247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm0, xmm5
327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm1, xmm5
328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8
330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0
332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
3337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
334ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 32x4 rectangle to 8x1.
3417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           uint8* dst_ptr, int dst_width) {
344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_ptr
348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // src_stride
349ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 12]   // dst_ptr
350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 16]   // dst_width
351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [esi + esi * 2]  // src_stride * 3
352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm7, 8
354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
355ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
3567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]           // average rows
3577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
3587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + esi]
3597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + esi + 16]
3607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm0, xmm2
361ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm3
3627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + esi * 2]
3637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + esi * 2 + 16]
3647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm4, [eax + edi]
3657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm5, [eax + edi + 16]
366ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
367ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm2, xmm4
368ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm3, xmm5
369ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2
370ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm3
371ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
372ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
373ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8
374ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, xmm1
375ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 8
376ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm7
377ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm3, xmm7
378ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgw      xmm0, xmm2
379ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgw      xmm1, xmm3
380ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm1
381ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
382ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
383ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 8
384ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pand       xmm2, xmm7
385ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgw      xmm0, xmm2
386ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
387ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
388ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0
389ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 8]
3907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8
391ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
392ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
393ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
394ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
395ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
396ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
397ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
398ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
3997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_SCALEROWDOWN4_AVX2
4007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Point samples 64 pixels to 16 pixels.
4017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
4037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                        uint8* dst_ptr, int dst_width) {
4047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
4057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         eax, [esp + 4]        // src_ptr
4067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                                      // src_stride ignored
4077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         edx, [esp + 12]       // dst_ptr
4087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         ecx, [esp + 16]       // dst_width
4097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
4107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrld      ymm5, ymm5, 24
4117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpslld      ymm5, ymm5, 16
4127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  wloop:
4147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm0, [eax]
4157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm1, [eax + 32]
4167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         eax,  [eax + 64]
4177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand       ymm0, ymm0, ymm5
4187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand       ymm1, ymm1, ymm5
4197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm0, ymm0, ymm1
4207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
4217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm0, ymm0, 8
4227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm0, ymm0, ymm0
4237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
4247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx], xmm0
4257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         edx, [edx + 16]
4267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         ecx, 16
4277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg          wloop
4287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
4307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
4317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
4327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
4337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Blends 64x4 rectangle to 16x1.
4357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
4367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
4377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian                           uint8* dst_ptr, int dst_width) {
4387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
4397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push        esi
4407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push        edi
4417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         eax, [esp + 8 + 4]    // src_ptr
4427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         esi, [esp + 8 + 8]    // src_stride
4437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         edx, [esp + 8 + 12]   // dst_ptr
4447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         ecx, [esp + 8 + 16]   // dst_width
4457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         edi, [esi + esi * 2]  // src_stride * 3
4467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
4477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm7, ymm7, 8
4487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  wloop:
4507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm0, [eax]           // average rows
4517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm1, [eax + 32]
4527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb      ymm0, ymm0, [eax + esi]
4537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb      ymm1, ymm1, [eax + esi + 32]
4547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm2, [eax + esi * 2]
4557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm3, [eax + esi * 2 + 32]
4567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb      ymm2, ymm2, [eax + edi]
4577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb      ymm3, ymm3, [eax + edi + 32]
4587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         eax, [eax + 64]
4597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb      ymm0, ymm0, ymm2
4607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgb      ymm1, ymm1, ymm3
4617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
4637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand       ymm3, ymm1, ymm7
4647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm0, ymm0, 8
4657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm1, ymm1, 8
4667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgw      ymm0, ymm0, ymm2
4677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgw      ymm1, ymm1, ymm3
4687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm0, ymm0, ymm1
4697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
4707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
4727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpsrlw      ymm0, ymm0, 8
4737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpavgw      ymm0, ymm0, ymm2
4747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpackuswb   ymm0, ymm0, ymm0
4757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
4767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx], xmm0
4787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         edx, [edx + 16]
4797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         ecx, 16
4807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg          wloop
4817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        edi
4837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop        esi
4847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
4857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
4867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
4877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
4887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_SCALEROWDOWN4_AVX2
4897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
490ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Point samples 32 pixels to 24 pixels.
491ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
492ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Then shuffled to do the scaling.
493ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
4947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
495ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
496ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_ptr, int dst_width) {
497ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
498ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_ptr
499ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride ignored
500ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12]       // dst_ptr
501ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]       // dst_width
502ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, kShuf0
503ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kShuf1
504ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kShuf2
505ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
506ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
5077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
5087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
509ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
510ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm1
511ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    palignr    xmm1, xmm0, 8
512ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm3
513ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm1, xmm4
514ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm2, xmm5
515ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0
516ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + 8], xmm1
517ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + 16], xmm2
518ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 24]
519ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 24
520ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
521ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
522ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
523ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
524ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
525ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
526ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 32x2 rectangle to 24x1
527ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
528ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Then shuffled to do the scaling.
529ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
530ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Register usage:
531ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// xmm0 src_row 0
532ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// xmm1 src_row 1
533ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// xmm2 shuf 0
534ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// xmm3 shuf 1
535ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// xmm4 shuf 2
536ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// xmm5 madd 0
537ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// xmm6 madd 1
538ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// xmm7 kRound34
539ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
540ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Note that movdqa+palign may be better than movdqu.
5417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
542ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
543ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                ptrdiff_t src_stride,
544ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                uint8* dst_ptr, int dst_width) {
545ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
546ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
547ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_ptr
548ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]    // src_stride
549ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]   // dst_ptr
550ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // dst_width
551ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, kShuf01
552ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, kShuf11
553ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kShuf21
554ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kMadd01
555ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, kMadd11
556ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm7, kRound34
557ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
558ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
5597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]           // pixels 0..7
5607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi]
561ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
562ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm2
563ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm5
564ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddsw     xmm0, xmm7
565ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 2
566ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
567ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0
568ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax + 8]       // pixels 8..15
569ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi + 8]
570ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
571ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm3
572ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm6
573ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddsw     xmm0, xmm7
574ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 2
575ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
576ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + 8], xmm0
5777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax + 16]      // pixels 16..23
5787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi + 16]
579ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
580ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
581ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm4
582ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, kMadd21
583ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm1
584ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddsw     xmm0, xmm7
585ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 2
586ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
587ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + 16], xmm0
588ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 24]
5897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 24
590ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
591ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
592ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
593ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
594ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
595ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
596ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
597ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Note that movdqa+palign may be better than movdqu.
5987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
599ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
600ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                ptrdiff_t src_stride,
601ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                uint8* dst_ptr, int dst_width) {
602ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
603ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
604ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_ptr
605ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]    // src_stride
606ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]   // dst_ptr
607ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // dst_width
608ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, kShuf01
609ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, kShuf11
610ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kShuf21
611ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kMadd01
612ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, kMadd11
613ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm7, kRound34
614ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
615ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
6167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]           // pixels 0..7
6177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi]
618ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm0
619ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
620ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm2
621ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm5
622ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddsw     xmm0, xmm7
623ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 2
624ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
625ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0
626ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm0, [eax + 8]       // pixels 8..15
627ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi + 8]
628ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm0
629ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
630ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm3
631ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm6
632ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddsw     xmm0, xmm7
633ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 2
634ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
635ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + 8], xmm0
6367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax + 16]      // pixels 16..23
6377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi + 16]
638ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]
639ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm0
640ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm1
641ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm4
642ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, kMadd21
643ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm1
644ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddsw     xmm0, xmm7
645ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 2
646ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0
647ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx + 16], xmm0
648ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx+24]
6497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 24
650ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
651ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
652ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
653ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
654ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
655ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
656ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
657ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// 3/8 point sampler
658ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
659ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scale 32 pixels to 12
6607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
661ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
662ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                          uint8* dst_ptr, int dst_width) {
  // Point sampler: each loop pass consumes 32 source bytes and stores 12
  // destination bytes. Only src_ptr, dst_ptr and dst_width are loaded from
  // the stack; src_stride is never read.
  // Naked function: there is no compiler-generated prologue, so arguments
  // are addressed directly relative to esp.
  // The loop is bottom-tested (sub / jg), so one whole 32-in / 12-out group
  // is always processed, including when dst_width is not a multiple of 12.
663ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
664ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_ptr
665ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride ignored
666ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12]       // dst_ptr
667ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]       // dst_width
668ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kShuf38a        // shuffle mask for source bytes 0..15
669ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kShuf38b        // shuffle mask for source bytes 16..31
670ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
671ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop:
6727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
6737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
674ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 32]       // advance source by 32 bytes
675ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm4            // select kept bytes of first half
676ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm1, xmm5            // select kept bytes of second half
677ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusb    xmm0, xmm1            // merge halves; presumably the masks zero each other's lanes - TODO confirm against kShuf38a/b
678ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
679ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edx], xmm0  // write 12 pixels: first 8 bytes
680ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhlps    xmm1, xmm0            // high qword of xmm0 -> low qword of xmm1
681ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edx + 8], xmm1       // remaining 4 bytes
682ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 12]       // advance destination by 12 bytes
6837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 12               // 12 output pixels done
684ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop                 // repeat while pixels remain
685ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
686ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
687ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
688ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
689ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
690ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scale 16x3 pixels to 6x1 with interpolation
6917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
692ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
693ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                ptrdiff_t src_stride,
694ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                uint8* dst_ptr, int dst_width) {
  // Each loop pass reads 16 bytes from three rows (src_ptr, src_ptr +
  // src_stride, src_ptr + 2 * src_stride), widens them to 16-bit words and
  // adds the rows together. It then adds each word to its two right-hand
  // neighbours, shuffles the wanted sums into place, scales them with
  // pmulhuw (high 16 bits of an unsigned 16x16 multiply) by kScaleAc33,
  // packs to bytes and stores 6 output bytes.
  // esi is saved and restored because it holds src_stride for the loop.
  // The loop is bottom-tested, so at least one 16-in / 6-out group is
  // always processed.
695ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
696ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
697ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_ptr
698ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]    // src_stride
699ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]   // dst_ptr
700ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // dst_width
701ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, kShufAc         // shuffle for sums from source pixels 0..7
702ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, kShufAc3        // shuffle for sums from source pixels 8..15
703ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kScaleAc33      // per-word multipliers used by pmulhuw
704ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm5, xmm5            // zero, used to widen bytes to words
705ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
706ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop:
7077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
7087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm6, [eax + esi]     // second row
709ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhlps    xmm1, xmm0            // first row, pixels 8..15
710ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhlps    xmm7, xmm6            // second row, pixels 8..15
711ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm5            // widen bytes to 16-bit words
712ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm1, xmm5
713ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm6, xmm5
714ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm7, xmm5
715ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm0, xmm6            // first + second row, pixels 0..7
716ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm1, xmm7            // first + second row, pixels 8..15
7177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm6, [eax + esi * 2] // third row
718ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]       // advance source by 16 bytes
719ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhlps    xmm7, xmm6            // third row, pixels 8..15
720ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm6, xmm5
721ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm7, xmm5
722ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm0, xmm6            // add third row, pixels 0..7
723ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm1, xmm7            // add third row, pixels 8..15
724ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
725ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
726ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrldq     xmm0, 2               // shift down one word: next column
727ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm6, xmm0            // column n + column n+1
728ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrldq     xmm0, 2               // shift down another word
729ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm6, xmm0            // + column n+2
730ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm6, xmm2            // keep the sums selected by kShufAc
731ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
732ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
733ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrldq     xmm1, 2               // same 3-column sum for pixels 8..15
734ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm7, xmm1
735ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrldq     xmm1, 2
736ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm7, xmm1
737ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm7, xmm3            // keep the sums selected by kShufAc3
738ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm6, xmm7            // combine both halves into one register
739ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
740ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
741ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm6, xmm6            // words -> bytes, unsigned saturation
742ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
743ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edx], xmm6           // write 6 pixels: bytes 0..3
744ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlq      xmm6, 16              // discard the two lowest bytes
745ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edx + 2], xmm6       // bytes 2..5 (overlaps previous store by 2)
746ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 6]        // advance destination by 6 bytes
7477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 6                // 6 output pixels done
748ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop                 // repeat while pixels remain
749ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
750ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
751ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
752ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
753ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
754ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
755ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Scale 16x2 pixels to 6x1 with interpolation
7567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
757ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
758ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                ptrdiff_t src_stride,
759ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                uint8* dst_ptr, int dst_width) {
  // Each loop pass reads 16 bytes from two rows (src_ptr and src_ptr +
  // src_stride) and averages them bytewise with pavgb (rounding up). Three
  // differently shuffled copies of the averaged row are added together as
  // words, scaled with pmulhuw by kScaleAb2, packed to bytes, and 6 output
  // bytes are stored.
  // esi is saved and restored because it holds src_stride for the loop.
  // The loop is bottom-tested, so at least one 16-in / 6-out group is
  // always processed.
760ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
761ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
762ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_ptr
763ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]    // src_stride
764ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]   // dst_ptr
765ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // dst_width
766ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, kShufAb0        // first of three shuffles of the averaged row
767ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm3, kShufAb1        // second shuffle
768ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kShufAb2        // third shuffle
769ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kScaleAb2       // per-word multipliers used by pmulhuw
770ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
771ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop:
7727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]           // average 2 rows into xmm0
7737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + esi]     // second row
774ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax, [eax + 16]       // advance source by 16 bytes
7757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pavgb      xmm0, xmm1            // bytewise rounded average of the rows
776ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
777ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
778ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm1, xmm2            // first contribution per output
779ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm6, xmm0
780ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm6, xmm3            // second contribution per output
781ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm1, xmm6            // word add; presumably the shuffles leave each high byte zero - TODO confirm
782ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm4            // third contribution per output
783ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm1, xmm0            // total of the three contributions
784ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
785ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
786ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm1, xmm1            // words -> bytes, unsigned saturation
787ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
788ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edx], xmm1           // write 6 pixels: bytes 0..3
789ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlq      xmm1, 16              // discard the two lowest bytes
790ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edx + 2], xmm1       // bytes 2..5 (overlaps previous store by 2)
791ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 6]        // advance destination by 6 bytes
7927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 6                // 6 output pixels done
793ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop                 // repeat while pixels remain
794ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
795ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
796ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
797ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
798ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
799ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
8007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Reads 16 bytes and accumulates to 16 shorts at a time.
8017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
8027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  // Adds a row of bytes into a row of 16-bit accumulators in place:
  // dst_ptr[i] = saturating_add(dst_ptr[i], src_ptr[i]).
  // Each loop pass handles 16 source bytes (32 destination bytes). The loop
  // is bottom-tested, so a whole group of 16 is always processed, including
  // when src_width is not a multiple of 16.
803ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
8047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        eax, [esp + 4]   // src_ptr
8057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        edx, [esp + 8]   // dst_ptr
8067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov        ecx, [esp + 12]  // src_width
8077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pxor       xmm5, xmm5       // zero, used to widen bytes to words
8087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
8097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  // sum rows
810ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop:
8117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax]       // read 16 bytes
8127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        eax, [eax + 16]   // advance source by 16 bytes
8137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [edx]       // read 16 words from destination
8147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [edx + 16]
8157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqa     xmm2, xmm3        // copy so both halves can be widened
8167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    punpcklbw  xmm2, xmm5        // source bytes 0..7 as words
8177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    punpckhbw  xmm3, xmm5        // source bytes 8..15 as words
818ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm0, xmm2        // sum 16 words
819ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddusw    xmm1, xmm3        // unsigned saturating add
8207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0       // write 16 words to destination
8217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm1
8227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea        edx, [edx + 32]   // advance destination by 16 words
823ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 16           // 16 source pixels done
824ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         xloop             // repeat while pixels remain
8257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ret
8267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  }
8277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian}
828ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
8297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#ifdef HAS_SCALEADDROW_AVX2
8307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian// Reads 32 bytes and accumulates to 32 shorts at a time.
8317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
8327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianvoid ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  // Adds a row of bytes into a row of 16-bit accumulators in place:
  // dst_ptr[i] = saturating_add(dst_ptr[i], src_ptr[i]).
  // Each loop pass handles 32 source bytes (64 destination bytes). The loop
  // is bottom-tested, so a whole group of 32 is always processed, including
  // when src_width is not a multiple of 32.
  // vzeroupper is issued before returning to clear the upper ymm halves.
8337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  __asm {
8347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         eax, [esp + 4]   // src_ptr
8357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         edx, [esp + 8]   // dst_ptr
8367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    mov         ecx, [esp + 12]  // src_width
8377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpxor       ymm5, ymm5, ymm5 // zero, used to widen bytes to words
8387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
8397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  // sum rows
8407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian  xloop:
8417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     ymm3, [eax]       // read 32 bytes
8427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         eax, [eax + 32]   // advance source by 32 bytes
8437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpermq      ymm3, ymm3, 0xd8  // reorder qwords to 0,2,1,3 so the per-lane unpacks below yield bytes 0..15 and 16..31 in order
8447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpcklbw  ymm2, ymm3, ymm5  // source bytes 0..15 as words
8457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpunpckhbw  ymm3, ymm3, ymm5  // source bytes 16..31 as words
8467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpaddusw    ymm0, ymm2, [edx] // sum 16 words (unsigned saturating)
8477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vpaddusw    ymm1, ymm3, [edx + 32] // sum the next 16 words
8487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx], ymm0       // write 32 words to destination
8497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmovdqu     [edx + 32], ymm1
8507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    lea         edx, [edx + 64]   // advance destination by 32 words
8517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         ecx, 32           // 32 source pixels done
8527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    jg          xloop             // repeat while pixels remain
8537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
8547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vzeroupper
855ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
856ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
857ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
8587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // HAS_SCALEADDROW_AVX2
859ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
860ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear column filtering. SSSE3 version.
8617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
862ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
863ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           int dst_width, int x, int dx) {
  // Horizontal linear filter. For every output byte the running position x
  // is split into a source index xi (bits 16..31 of x, extracted with
  // pextrw, so only 16 bits of index are used) and a 7-bit fraction f (the
  // top 7 bits of the low 16 bits of x). The output is
  //   (src_ptr[xi] * (f ^ 0x7f) + src_ptr[xi + 1] * f) >> 7
  // and x then advances by dx.
  // The main loop produces two outputs per pass, tracking x0 and x1 = x0 + dx
  // side by side and stepping both by 2 * dx. An odd trailing output is
  // handled after the loop. Nothing is written when dst_width <= 0.
  // Note that two adjacent source bytes are always read per output, so the
  // byte at xi + 1 is read even for the final output.
  // ebx, esi and edi are saved on entry and restored before returning.
864ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
865ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       ebx
866ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
867ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
868ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 12 + 4]    // dst_ptr
869ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 12 + 8]    // src_ptr
870ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12 + 12]   // dst_width
871ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, [esp + 12 + 16]  // x
872ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, [esp + 12 + 20]  // dx
873ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
874ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm5, eax            // pshufb control: bytes 0,1 <- byte 0; bytes 2,3 <- byte 4
875ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
876ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm6, 9              // every word becomes 0x007f
877ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     eax, xmm2, 1         // get x0 integer. preroll
878ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 2               // bias count by -2 for the 2-pixel loop
879ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         xloop29              // fewer than 2 pixels: skip the main loop
880ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
881ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm0, xmm2           // x1 = x0 + dx
882ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, xmm3
883ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm2, xmm0           // x0 x1
884ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm3, xmm3           // dx dx
885ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm3, xmm3           // dx * 2, dx * 2
886ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     edx, xmm2, 3         // get x1 integer. preroll
887ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
888ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 2 Pixel loop.
889ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop2:
890ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm2           // x0, x1 fractions.
891ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, xmm3           // x += dx
892ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
893ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, ebx
894ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 9              // 7 bit fractions.
895ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
896ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm4, ebx
897ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm1, xmm5           // 0011
898ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklwd  xmm0, xmm4           // word 0 = x0 pixel pair, word 1 = x1 pixel pair
899ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm1, xmm6           // 0..7f and 7f..0
900ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
901ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
902ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
903ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
904ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
905ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       ebx, xmm0
906ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        [edi], bx            // store 2 output bytes
907ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 2]       // advance destination by 2 bytes
908ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 2               // 2 pixels
909ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        xloop2               // repeat while at least 2 pixels remain
910ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
911ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop29:
912ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
913ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        ecx, 2 - 1           // undo the -2 bias, then subtract 1: ecx >= 0 only if one pixel is left
914ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         xloop99              // nothing left: done
915ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
916ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 pixel remainder
917ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
918ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, ebx
919ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 9              // 7 bit fractions.
920ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm2, xmm5           // 0011
921ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm2, xmm6           // 0..7f and 7f..0
922ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm2           // 16 bit
923ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
924ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0           // 8 bits
925ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       ebx, xmm0
926ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        [edi], bl            // store the single remaining output byte
927ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
928ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99:
929ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
930ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
931ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
932ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        ebx
933ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
934ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
935ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
936ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
937ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 16 pixels, duplicates them and writes 32 pixels.
9387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
939ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
940ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                       int dst_width, int x, int dx) {
  // Doubles a row horizontally by writing every source byte twice. Each loop
  // pass reads 16 source bytes and stores 32 destination bytes.
  // Only dst_ptr, src_ptr and dst_width are loaded from the stack; x and dx
  // are never read, so reading always starts at src_ptr itself.
  // The loop is bottom-tested, so a whole group of 32 output bytes is always
  // written, including when dst_width is not a multiple of 32.
941ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
942ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4]    // dst_ptr
943ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8]    // src_ptr
944ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]   // dst_width
945ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
946ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
9477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]       // read 16 source bytes
948ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 16]  // advance source by 16 bytes
949ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0        // copy for the upper half
950ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklbw  xmm0, xmm0        // bytes 0..7, each duplicated
951ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhbw  xmm1, xmm1        // bytes 8..15, each duplicated
9527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0       // write first 16 output bytes
9537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm1  // write next 16 output bytes
954ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]   // advance destination by 32 bytes
9557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 32           // 32 output pixels done
956ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop             // repeat while pixels remain
957ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
958ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
959ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
960ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
961ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
962ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
9637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
964ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDown2_SSE2(const uint8* src_argb,
965ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            ptrdiff_t src_stride,
966ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                            uint8* dst_argb, int dst_width) {
  // Each loop pass reads 32 source bytes (eight 4-byte pixels) and stores 16
  // destination bytes (four pixels). The shufps immediate 0xdd picks 32-bit
  // elements 1 and 3 from each of the two loaded registers, i.e. source
  // pixels 1, 3, 5 and 7 of the group.
  // Only src_argb, dst_argb and dst_width are loaded; src_stride is never
  // read. The loop is bottom-tested, so a whole group of 4 output pixels is
  // always written, including when dst_width is not a multiple of 4.
967ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
968ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_argb
969ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride ignored
970ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12]       // dst_argb
971ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]       // dst_width
972ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
973ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
9747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]           // source pixels 0..3
9757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]      // source pixels 4..7
976ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]      // advance source by 8 pixels
977ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0xdd      // keep elements 1,3 of each: pixels 1,3,5,7
9787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0           // write 4 pixels
979ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]       // advance destination by 4 pixels
9807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4                // 4 output pixels done
981ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop                 // repeat while pixels remain
982ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
983ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
984ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
985ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
986ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
987ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 8x1 rectangle to 4x1.
9887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
989ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
990ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                  ptrdiff_t src_stride,
991ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                  uint8* dst_argb, int dst_width) {
992ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
993ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]        // src_argb
994ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride ignored
995ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12]       // dst_argb
996ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 16]       // dst_width
997ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
998ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
9997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
10007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
1001ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
1002ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0
1003ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88      // even pixels
1004ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm1, 0xdd      // odd pixels
1005ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2
10067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1007ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
10087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
1009ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
1010ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1011ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1012ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1013ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1014ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1015ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends 8x2 rectangle to 4x1.
10167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1017ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
1018ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                               ptrdiff_t src_stride,
1019ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                               uint8* dst_argb, int dst_width) {
1020ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1021ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1022ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4 + 4]    // src_argb
1023ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 4 + 8]    // src_stride
1024ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4 + 12]   // dst_argb
1025ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 4 + 16]   // dst_width
1026ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1027ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
10287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]
10297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm1, [eax + 16]
10307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm2, [eax + esi]
10317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm3, [eax + esi + 16]
1032ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 32]
1033ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2            // average rows
1034ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm3
1035ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
1036ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88      // even pixels
1037ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm1, 0xdd      // odd pixels
1038ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2
10397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1040ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
10417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
1042ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
1043ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1044ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
1045ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1046ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1047ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1048ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1049ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 4 pixels at a time.
10507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1051ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
1052ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                               int src_stepx,
1053ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                               uint8* dst_argb, int dst_width) {
1054ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1055ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       ebx
1056ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1057ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8 + 4]    // src_argb
1058ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                     // src_stride ignored
1059ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ebx, [esp + 8 + 12]   // src_stepx
1060ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 8 + 16]   // dst_argb
1061ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 20]   // dst_width
1062ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        ebx, [ebx * 4]
1063ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [ebx + ebx * 2]
1064ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1065ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
1066ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, [eax]
1067ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm1, [eax + ebx]
1068ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm0, xmm1
1069ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, [eax + ebx * 2]
1070ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, [eax + edi]
1071ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + ebx * 4]
1072ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm2, xmm3
1073ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklqdq xmm0, xmm2
10747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1075ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
10767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
1077ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
1078ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1079ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1080ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        ebx
1081ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1082ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1083ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1084ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1085ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Blends four 2x2 to 4x1.
10867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1087ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
1088ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                  ptrdiff_t src_stride,
1089ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                  int src_stepx,
1090ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                                  uint8* dst_argb, int dst_width) {
1091ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1092ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       ebx
1093ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1094ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1095ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 12 + 4]    // src_argb
1096ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 12 + 8]    // src_stride
1097ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ebx, [esp + 12 + 12]   // src_stepx
1098ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 12 + 16]   // dst_argb
1099ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12 + 20]   // dst_width
1100ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi, [eax + esi]       // row1 pointer
1101ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        ebx, [ebx * 4]
1102ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [ebx + ebx * 2]
1103ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1104ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
1105ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm0, qword ptr [eax]  // row0 4 pairs
1106ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     xmm0, qword ptr [eax + ebx]
1107ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm1, qword ptr [eax + ebx * 2]
1108ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     xmm1, qword ptr [eax + edi]
1109ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + ebx * 4]
1110ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm2, qword ptr [esi]  // row1 4 pairs
1111ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     xmm2, qword ptr [esi + ebx]
1112ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm3, qword ptr [esi + ebx * 2]
1113ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     xmm3, qword ptr [esi + edi]
1114ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        esi,  [esi + ebx * 4]
1115ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2            // average rows
1116ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm1, xmm3
1117ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
1118ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm0, xmm1, 0x88      // even pixels
1119ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shufps     xmm2, xmm1, 0xdd      // odd pixels
1120ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pavgb      xmm0, xmm2
11217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0
1122ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 16]
11237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4
1124ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop
1125ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1126ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1127ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
1128ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        ebx
1129ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1130ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1131ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1132ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1133ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Column scaling unfiltered. SSE2 version.
11347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1135ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
1136ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                        int dst_width, int x, int dx) {
1137ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1138ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1139ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1140ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 4]    // dst_argb
1141ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // src_argb
1142ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 12]   // dst_width
1143ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, [esp + 8 + 16]  // x
1144ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, [esp + 8 + 20]  // dx
1145ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1146ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
1147ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
1148ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, xmm0
1149ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
1150ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
1151ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, xmm0            // x3 x2 x1 x0
1152ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
1153ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
1154ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1155ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     eax, xmm2, 1          // get x0 integer.
1156ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     edx, xmm2, 3          // get x1 integer.
1157ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1158ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cmp        ecx, 0
1159ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jle        xloop99
1160ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 4
1161ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         xloop49
1162ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1163ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 4 Pixel loop.
1164ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop4:
1165ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
1166ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
1167ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     eax, xmm2, 5           // get x2 integer.
1168ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     edx, xmm2, 7           // get x3 integer.
1169ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, xmm3             // x += dx
1170ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm0, xmm1             // x0 x1
1171ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1172ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
1173ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
1174ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
1175ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
1176ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm1, xmm4             // x2 x3
1177ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
1178ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqu     [edi], xmm0
1179ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 16]
11807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 4                 // 4 pixels
1181ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        xloop4
1182ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1183ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop49:
1184ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    test       ecx, 2
1185ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop29
1186ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1187ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 2 Pixels.
1188ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
1189ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
1190ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     eax, xmm2, 5           // get x2 integer.
1191ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm0, xmm1             // x0 x1
1192ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1193ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edi], xmm0
1194ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 8]
1195ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1196ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop29:
1197ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    test       ecx, 1
1198ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    je         xloop99
1199ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1200ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 Pixels.
1201ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
1202ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       dword ptr [edi], xmm0
1203ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99:
1204ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1205ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
1206ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1207ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1208ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1209ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1210ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1211ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
1212ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// TODO(fbarchard): Port to Neon
1213ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1214ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
1215ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuffleColARGB = {
1216ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
1217ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
1218ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
1219ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1220ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Shuffle table for duplicating 2 fractions into 8 bytes each
1221ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianstatic uvec8 kShuffleFractions = {
1222ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
1223ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian};
1224ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
12257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1226ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
1227ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                               int dst_width, int x, int dx) {
1228ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1229ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       esi
1230ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    push       edi
1231ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edi, [esp + 8 + 4]    // dst_argb
1232ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        esi, [esp + 8 + 8]    // src_argb
1233ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8 + 12]   // dst_width
1234ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm2, [esp + 8 + 16]  // x
1235ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       xmm3, [esp + 8 + 20]  // dx
1236ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm4, kShuffleColARGB
1237ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm5, kShuffleFractions
1238ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
1239ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm6, 9
1240ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     eax, xmm2, 1         // get x0 integer. preroll
1241ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 2
1242ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         xloop29
1243ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1244ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm0, xmm2           // x1 = x0 + dx
1245ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm0, xmm3
1246ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm2, xmm0           // x0 x1
1247ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm3, xmm3           // dx dx
1248ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm3, xmm3           // dx * 2, dx * 2
1249ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     edx, xmm2, 3         // get x1 integer. preroll
1250ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1251ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 2 Pixel loop.
1252ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  xloop2:
1253ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm2           // x0, x1 fractions.
1254ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    paddd      xmm2, xmm3           // x += dx
1255ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1256ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm1, 9              // 7 bit fractions.
1257ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
1258ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm1, xmm5           // 0000000011111111
1259ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm4           // arrange pixels into pairs
1260ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm1, xmm6           // 0..7f and 7f..0
1261ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
1262ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
1263ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
1264ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
1265ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
1266ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       qword ptr [edi], xmm0
1267ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edi, [edi + 8]
1268ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 2               // 2 pixels
1269ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jge        xloop2
1270ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1271ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop29:
1272ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1273ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    add        ecx, 2 - 1
1274ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jl         xloop99
1275ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1276ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    // 1 pixel remainder
1277ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm2, 9              // 7 bit fractions.
1278ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1279ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm2, xmm5           // 00000000
1280ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pshufb     xmm0, xmm4           // arrange pixels into pairs
1281ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pxor       xmm2, xmm6           // 0..7f and 7f..0
1282ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
1283ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    psrlw      xmm0, 7
1284ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
1285ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movd       [edi], xmm0
1286ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1287ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian xloop99:
1288ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1289ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        edi
1290ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    pop        esi
1291ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1292ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1293ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1294ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1295ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Reads 4 pixels, duplicates them and writes 8 pixels.
// Horizontal 2x upscale by pixel replication: every 4-byte source pixel is
// written twice to the destination.
//   dst_argb  - destination row (unaligned stores are used).
//   src_argb  - source row (unaligned loads are used).
//   dst_width - number of destination pixels; consumed 8 at a time.
//   x, dx     - never read by this routine.
// The loop body runs before dst_width is tested, so at least 8 destination
// pixels (32 bytes) are always written and 4 source pixels (16 bytes) read;
// a width that is not a multiple of 8 is effectively rounded up.
// Naked function: arguments are read straight from the stack and the plain
// 'ret' leaves them for the caller. Uses eax, ecx, edx, xmm0 and xmm1.
12967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1297ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianvoid ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
1298ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian                           int dst_width, int x, int dx) {
1299ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1300ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        edx, [esp + 4]    // dst_argb
1301ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 8]    // src_argb
1302ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 12]   // dst_width
1303ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1304ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  wloop:
13057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     xmm0, [eax]  // load 4 source pixels: p0 p1 p2 p3
1306ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        eax,  [eax + 16]  // advance source by 16 bytes
1307ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    movdqa     xmm1, xmm0  // second copy for the upper two pixels
1308ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckldq  xmm0, xmm0  // xmm0 = p0 p0 p1 p1
1309ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    punpckhdq  xmm1, xmm1  // xmm1 = p2 p2 p3 p3
13107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx], xmm0  // store destination pixels 0..3
13117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    movdqu     [edx + 16], xmm1  // store destination pixels 4..7
1312ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    lea        edx, [edx + 32]  // advance destination by 32 bytes
13137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub        ecx, 8  // 8 destination pixels done
1314ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    jg         wloop  // signed test: repeat while pixels remain
1315ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1316ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1317ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1318ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1319ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1320ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Divide num by div and return as 16.16 fixed point result.
// Computes ((64-bit)num << 16) / div with a signed divide; the 32-bit
// quotient is returned in eax and the remainder (left in edx) is discarded.
// There is no guard before the idiv: the CPU raises a divide error if div is
// 0 or if the quotient does not fit in a signed 32-bit value.
// Naked function: arguments are read from the stack. Uses eax and edx.
13217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1322ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianint FixedDiv_X86(int num, int div) {
1323ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1324ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]    // num
1325ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cdq                          // extend num to 64 bits
1326ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shld       edx, eax, 16      // 32.16
1327ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shl        eax, 16  // low half of the shift; edx:eax = num << 16
1328ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    idiv       dword ptr [esp + 8]  // eax = edx:eax / div
1329ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1330ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1331ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
1332ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1333ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian// Divide ((num << 16) - 0x00010001) by (div - 1) and return the quotient.
// Variant of FixedDiv_X86 that biases both operands before dividing: the
// 64-bit dividend is (num << 16) minus 0x00010001 and the divisor is div - 1.
// The signed 32-bit quotient is returned in eax.
// NOTE(review): presumably the 16.16 step used when the first and last
// samples must map onto each other exactly - confirm against callers.
// There is no guard before the idiv: the CPU raises a divide error if div is
// 1 (divisor becomes 0) or if the quotient does not fit in a signed 32-bit
// value.
// Naked function: arguments are read from the stack. Uses eax, ecx and edx.
13347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian__declspec(naked)
1335ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanianint FixedDiv1_X86(int num, int div) {
1336ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  __asm {
1337ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        eax, [esp + 4]    // num
1338ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    mov        ecx, [esp + 8]    // denom
1339ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    cdq                          // extend num to 64 bits
1340ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shld       edx, eax, 16      // 32.16
1341ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    shl        eax, 16  // low half of the shift; edx:eax = num << 16
1342ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        eax, 0x00010001  // 64-bit subtract, low dword
1343ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sbb        edx, 0  // propagate the borrow into the high dword
1344ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    sub        ecx, 1  // divisor = div - 1
1345ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    idiv       ecx  // eax = edx:eax / (div - 1)
1346ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian    ret
1347ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian  }
1348ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}
13497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
1350ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian
1351ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#ifdef __cplusplus
1352ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}  // extern "C"
1353ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian}  // namespace libyuv
1354ba6c59e9d7d7013b3906b6f4230b663422681848Vignesh Venkatasubramanian#endif
1355