/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
11dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#include "libyuv/row.h"
12dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
13dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#ifdef __cplusplus
14dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comnamespace libyuv {
15dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comextern "C" {
16dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#endif
17dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
18dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// This module is for Visual C x86.
19dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
20dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
21dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 0 to 9
// The low 8 lanes hold byte indices 0,1,3,4,5,7,8,9 (indices 2 and 6 are
// skipped). The high 8 lanes are 128, i.e. the top bit is set.
// NOTE(review): presumably a pshufb control, where a set top bit zeroes the
// lane; the consuming code is outside this chunk.
22dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf0 =
23dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
24dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
25dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
// Adding the 8 back gives source bytes 11,12,13,15,16,17,19,20 (14 and 18 are
// skipped). The high 8 lanes are 128 (top bit set).
26dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf1 =
27dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
28dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
29dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
// Adding the 16 back gives source bytes 21,23,24,25,27,28,29,31 (22, 26 and 30
// are skipped). The high 8 lanes are 128 (top bit set).
30dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf2 =
31dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
32dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
33dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 0 to 10
// Read two lanes at a time, the table lists adjacent source byte pairs:
// (0,1) (1,2) (2,3) (4,5) (5,6) (6,7) (8,9) (9,10).
34dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf01 =
35dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
36dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
37dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
// Adding the 8 back, the adjacent source byte pairs are:
// (10,11) (12,13) (13,14) (14,15) (16,17) (17,18) (18,19) (20,21).
38dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf11 =
39dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
40dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
41dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
// Adding the 16 back, the adjacent source byte pairs are:
// (21,22) (22,23) (24,25) (25,26) (26,27) (28,29) (29,30) (30,31).
42dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf21 =
43dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
44dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
45dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Coefficients for source bytes 0 to 10
// One weight per byte lane; each adjacent pair (3,1), (2,2), (1,3) sums to 4.
// NOTE(review): presumably multiplied against the pairs laid out by kShuf01;
// the consuming code is outside this chunk.
46dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kMadd01 =
47dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
48dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
49dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Coefficients for source bytes 10 to 21
// One weight per byte lane; each adjacent pair (1,3), (3,1), (2,2) sums to 4.
// NOTE(review): presumably multiplied against the pairs laid out by kShuf11;
// the consuming code is outside this chunk.
50dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kMadd11 =
51dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
52dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
53dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Coefficients for source bytes 21 to 31
// One weight per byte lane; each adjacent pair (2,2), (1,3), (3,1) sums to 4.
// NOTE(review): presumably multiplied against the pairs laid out by kShuf21;
// the consuming code is outside this chunk.
54dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kMadd21 =
55dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
56dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
57dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Rounding constant: eight 16-bit lanes, each holding 2.
// NOTE(review): 2 is half of the weight sum (4) used by the kMadd tables, so
// this is presumably added before a shift right by 2; the consuming code is
// outside this chunk.
58dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic vec16 kRound34 =
59dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 2, 2, 2, 2, 2, 2, 2, 2 };
60dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
// Places source bytes 0,3,6,8,11,14 in lanes 0..5; all other lanes are 128
// (top bit set).
// NOTE(review): presumably a pshufb control for the 3/8 point sampler, where
// a set top bit zeroes the lane; the consuming code is outside this chunk.
61dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf38a =
62dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
63dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
// Places source bytes 0,3,6,8,11,14 in lanes 6..11; all other lanes are 128
// (top bit set). This is the same selection as kShuf38a, moved up six lanes.
// NOTE(review): presumably the two shuffle results are OR-ed together; the
// consuming code is outside this chunk.
64dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuf38b =
65dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
66dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
67dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange words 0,3,6 into 0,1,2
// Byte pairs (0,1), (6,7), (12,13) are 16-bit words 0, 3 and 6; they land in
// byte lanes 0..5. All other lanes are 128 (top bit set).
68dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAc =
69dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
70dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
71dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange words 0,3,6 into 3,4,5
// Same word selection as kShufAc, but written to byte lanes 6..11 (word slots
// 3,4,5). All other lanes are 128 (top bit set).
72dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAc3 =
73dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
74dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
75dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scaling values for boxes of 3x3 and 2x3
// 16-bit reciprocals scaled by 65536: 65536/9 for a 9-sample box and 65536/6
// for a 6-sample box, in the lane order 9,9,6,9,9,6. The last two lanes are 0.
// NOTE(review): presumably used with a high-half 16-bit multiply to divide
// the box sums; the consuming code is outside this chunk.
76dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec16 kScaleAc33 =
77dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
78dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
79dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange first value for pixels 0,1,2,3,4,5
// Source bytes 0,3,6,8,11,14 go to the low byte of words 0..5. The
// interleaved 128 entries (top bit set) fill each high byte, and words 6..7
// are 128 throughout.
80dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAb0 =
81dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
82dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
83dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange second value for pixels 0,1,2,3,4,5
// Source bytes 1,4,7,9,12,15 go to the low byte of words 0..5. The
// interleaved 128 entries (top bit set) fill each high byte, and words 6..7
// are 128 throughout.
84dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAb1 =
85dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
86dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
87dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Arrange third value for pixels 0,1,2,3,4,5
// Source bytes 2,5,10,13 go to the low byte of words 0,1,3,4. Words 2 and 5
// are all 128 (top bit set), so the pixels built from only two source columns
// get no third value.
88dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShufAb2 =
89dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
90dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
91dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scaling values for boxes of 3x2 and 2x2
// 16-bit values 65536/3 and 65536/2 in the lane order 3,3,2,3,3,2. The last
// two lanes are 0.
// NOTE(review): presumably applied with a high-half 16-bit multiply; the
// consuming code is outside this chunk.
92dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec16 kScaleAb2 =
93dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
94dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
95dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 32 pixels, throws half away and writes 16 pixels.
96dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// Keeps the odd-indexed byte of every source byte pair; src_stride is not read.
// The loop body runs before the width is tested, so one 32-in/16-out
// iteration always executes, and dst_width is consumed in steps of 16.
// Naked function: arguments are read directly from the stack via esp.
97dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
98dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
99dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                        uint8* dst_ptr, int dst_width) {
100dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
101dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_ptr
102dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
103dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_ptr
104dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
105dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
106c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
107dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
108dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // aligned load, source bytes 0..15
109dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]      // aligned load, source bytes 16..31
110dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]      // advance src by 32 bytes
111dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // isolate odd pixels.
112dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 8
113dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm1            // narrow 16 words to 16 bytes
114dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 16               // sets the flags tested by jg below
115dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0           // aligned store of 16 output bytes
116dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]       // advance dst; lea leaves flags intact
117dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
118dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
119dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
120dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
121dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
122dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
123dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x1 rectangle to 16x1.
124dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// Each output byte is the pavgw (rounding) average of one even/odd source
// byte pair from a single row; src_stride is not read.
// The loop body runs before the width is tested, so one iteration always
// executes, and dst_width is consumed in steps of 16.
125dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
126dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
127dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                              uint8* dst_ptr, int dst_width) {
128dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
129dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_ptr
130dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
131dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_ptr
132dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
133dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
134dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm5, 8
135dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
136c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
137dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
138dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // aligned load, source bytes 0..15
139dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]      // aligned load, source bytes 16..31
140dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]      // advance src by 32 bytes
141dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
142dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
143dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // odd bytes as 16-bit words
144dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, xmm1
145dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 8
146dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm2, xmm5            // even bytes as 16-bit words
147dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm3, xmm5
148dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm0, xmm2            // rounded average of odd and even
149dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm1, xmm3
150dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm1            // narrow 16 words to 16 bytes
151dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
152dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 16               // sets the flags tested by jg below
153dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0           // aligned store of 16 output bytes
154dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]       // advance dst; lea leaves flags intact
155dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
156dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
157dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
158dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
159dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
160dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
161dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x2 rectangle to 16x1.
162dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// The second row is read at src_ptr + src_stride with movdqa, so that address
// must be 16 byte aligned as well.
// Averaging is done in two rounding stages: rows first (pavgb), then adjacent
// columns (pavgw).
// The loop body runs before the width is tested, so one iteration always
// executes, and dst_width is consumed in steps of 16.
163dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
164dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
165dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                           uint8* dst_ptr, int dst_width) {
166dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
167dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi                   // saved here, restored before ret
168dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_ptr
169dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 4 + 8]    // src_stride
170dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4 + 12]   // dst_ptr
171dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // dst_width
172dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
173dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm5, 8
174dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
175c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
176dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
177dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // row 0, bytes 0..15
178dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]      // row 0, bytes 16..31
179dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, [eax + esi]     // row 1, bytes 0..15
180dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, [eax + esi + 16]  // row 1, bytes 16..31
181dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]      // advance src by 32 bytes
182dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2            // average rows
183dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm3
184dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
185dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
186dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // odd bytes as 16-bit words
187dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, xmm1
188dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 8
189dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm2, xmm5            // even bytes as 16-bit words
190dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm3, xmm5
191dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm0, xmm2            // rounded average of odd and even
192dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm1, xmm3
193dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm1            // narrow 16 words to 16 bytes
194dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
195dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 16               // sets the flags tested by jg below
196dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0           // aligned store of 16 output bytes
197dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]       // advance dst; lea leaves flags intact
198dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
199dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
200dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
201dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
202dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
203dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
204dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
205dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 32 pixels, throws half away and writes 16 pixels.
206dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// No alignment requirement: all loads and stores use movdqu.
// Keeps the odd-indexed byte of every source byte pair; src_stride is not read.
// The loop body runs before the width is tested, so one 32-in/16-out
// iteration always executes, and dst_width is consumed in steps of 16.
207dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
208dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
209dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                  ptrdiff_t src_stride,
210dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                  uint8* dst_ptr, int dst_width) {
211dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
212dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_ptr
213dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
214dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_ptr
215dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
216dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
217c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
218dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
219dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm0, [eax]           // unaligned load, source bytes 0..15
220dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm1, [eax + 16]      // unaligned load, source bytes 16..31
221dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]      // advance src by 32 bytes
222dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // isolate odd pixels.
223dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 8
224dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm1            // narrow 16 words to 16 bytes
225dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 16               // sets the flags tested by jg below
226dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     [edx], xmm0           // unaligned store of 16 output bytes
227dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]       // advance dst; lea leaves flags intact
228dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
229dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
230dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
231dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
232dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
233dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
234dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x1 rectangle to 16x1.
235dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// No alignment requirement: all loads and stores use movdqu.
// Each output byte is the pavgw (rounding) average of one even/odd source
// byte pair from a single row; src_stride is not read.
// The loop body runs before the width is tested, so one iteration always
// executes, and dst_width is consumed in steps of 16.
236dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
2371f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.comvoid ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
2381f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com                                        ptrdiff_t src_stride,
239dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                        uint8* dst_ptr, int dst_width) {
240dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
241dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_ptr
242dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
243dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_ptr
244dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
245dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
246dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm5, 8
247dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
248c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
249dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
250dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm0, [eax]           // unaligned load, source bytes 0..15
251dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm1, [eax + 16]      // unaligned load, source bytes 16..31
252dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]      // advance src by 32 bytes
253dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
254dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
255dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // odd bytes as 16-bit words
256dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, xmm1
257dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 8
258dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm2, xmm5            // even bytes as 16-bit words
259dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm3, xmm5
260dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm0, xmm2            // rounded average of odd and even
261dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm1, xmm3
262dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm1            // narrow 16 words to 16 bytes
263dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
264dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 16               // sets the flags tested by jg below
265dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     [edx], xmm0           // unaligned store of 16 output bytes
266dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]       // advance dst; lea leaves flags intact
267dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
268dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
269dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
270dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
271dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
272dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
273dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x2 rectangle to 16x1.
274dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// No alignment requirement: all loads and stores use movdqu.
// The second row is read at src_ptr + src_stride.
// Averaging is done in two rounding stages: rows first (pavgb), then adjacent
// columns (pavgw).
// The loop body runs before the width is tested, so one iteration always
// executes, and dst_width is consumed in steps of 16.
275dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
276dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
277dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     ptrdiff_t src_stride,
278dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     uint8* dst_ptr, int dst_width) {
279dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
280dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi                   // saved here, restored before ret
281dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_ptr
282dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 4 + 8]    // src_stride
283dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4 + 12]   // dst_ptr
284dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // dst_width
285dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
286dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm5, 8
287dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
288c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
289dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
290dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm0, [eax]           // row 0, bytes 0..15
291dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm1, [eax + 16]      // row 0, bytes 16..31
292dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm2, [eax + esi]     // row 1, bytes 0..15
293dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm3, [eax + esi + 16]  // row 1, bytes 16..31
294dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]      // advance src by 32 bytes
295dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2            // average rows
296dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm3
297dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
298dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
299dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // odd bytes as 16-bit words
300dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, xmm1
301dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 8
302dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm2, xmm5            // even bytes as 16-bit words
303dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm3, xmm5
304dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm0, xmm2            // rounded average of odd and even
305dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm1, xmm3
306dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm1            // narrow 16 words to 16 bytes
307dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
308dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 16               // sets the flags tested by jg below
309dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     [edx], xmm0           // unaligned store of 16 output bytes
310dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]       // advance dst; lea leaves flags intact
311dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
312dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
313dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
314dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
315dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
316dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
317dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
318dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Point samples 32 pixels to 8 pixels.
319dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
// NOTE(review): the source loads are movdqa, which does require 16 byte
// alignment. The store is a movq, which does not fault on unaligned
// addresses, so the 8 byte figure is presumably a performance expectation.
// Of every four source bytes, the one at index 2 is kept (mask 0x00ff0000
// per little-endian dword); src_stride is not read.
// The loop body runs before the width is tested, so one 32-in/8-out iteration
// always executes, and dst_width is consumed in steps of 8.
320dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
321dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
322dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                        uint8* dst_ptr, int dst_width) {
323dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
324dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_ptr
325dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
326dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_ptr
327dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
328dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
329dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrld      xmm5, 24              // each dword becomes 0x000000ff
330dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pslld      xmm5, 16              // each dword becomes 0x00ff0000
331dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
332c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
333dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
334dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // aligned load, source bytes 0..15
335dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]      // aligned load, source bytes 16..31
336dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]      // advance src by 32 bytes
337dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm0, xmm5            // keep byte 2 of each dword
338dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm1, xmm5
339dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm1            // words to bytes: 0,v0,0,v1,...
340dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // move each kept byte to the low byte of its word
341dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0            // 8 result bytes in the low qword
342dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 8                // sets the flags tested by jg below
343dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx], xmm0 // store 8 output bytes
344dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 8]        // advance dst; lea leaves flags intact
345dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
346dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
347dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
348dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
349dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
350dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
351dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x4 rectangle to 8x1.
352dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
353dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
354dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
355dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                           uint8* dst_ptr, int dst_width) {
356dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {                            // esi/edi are saved here, restored at exit
357dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi
358dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       edi                   // two pushes: args start at [esp + 8 + 4]
359dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_ptr
360dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 8 + 8]    // src_stride
361dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 8 + 12]   // dst_ptr
362dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 8 + 16]   // dst_width
363dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edi, [esi + esi * 2]  // src_stride * 3
364dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
365dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm7, 8               // each word = 0x00ff
366dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // Each pass reads 32 columns x 4 rows and writes 8 bytes; runs at least once.
367c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
368dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
369dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // row 0, columns 0..15
370dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]      // row 0, columns 16..31
371dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, [eax + esi]     // row 1 (aligned load: stride must keep rows aligned)
372dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, [eax + esi + 16]
373dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2            // average rows 0 and 1 (pavgb rounds up)
374dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm3
375dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, [eax + esi * 2]  // row 2
376dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, [eax + esi * 2 + 16]
377dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm4, [eax + edi]     // row 3
378dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm5, [eax + edi + 16]
379dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax, [eax + 32]       // advance source by 32 bytes
380dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm2, xmm4            // average rows 2 and 3
381dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm3, xmm5
382dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2            // average the two row-pair averages
383dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm3
384dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
385dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
386dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // odd columns as words
387dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, xmm1
388dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 8
389dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm2, xmm7            // even columns as words
390dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm3, xmm7
391dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm0, xmm2            // average each even/odd column pair
392dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm1, xmm3
393dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm1            // 16 words -> 16 bytes
394dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
395dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
396dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 8               // odd columns as words
397dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pand       xmm2, xmm7            // even columns as words
398dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgw      xmm0, xmm2            // average each even/odd column pair
399dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0            // 8 words -> 8 bytes in low qword
400dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
401dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 8                // sets flags for jg; movq/lea keep them
402dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx], xmm0  // store 8 output pixels
403dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 8]        // advance destination by 8 bytes
404dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
405dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
406dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        edi                   // restore in reverse push order
407dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
408dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
409dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
410dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
411dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
412dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Point samples 32 pixels to 24 pixels.
413dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
414dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Then shuffled to do the scaling.
415dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
416dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Note that movdqa+palignr may be better than movdqu.
417dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
418dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
419dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
420dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                          uint8* dst_ptr, int dst_width) {
421dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
422dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_ptr
423dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
424dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_ptr
425dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
426dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, kShuf0          // applied to source bytes 0..15
427dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm4, kShuf1          // applied to source bytes 8..23
428dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm5, kShuf2          // applied to source bytes 16..31
429dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // Each pass reads 32 source bytes and writes 24; the body runs at least once.
430c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
431dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
432dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // source bytes 0..15
433dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]      // source bytes 16..31
434dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]      // advance source by 32 bytes
435dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm1            // keep bytes 16..31 for the third group
436dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    palignr    xmm1, xmm0, 8         // xmm1 = source bytes 8..23
437dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm3            // select bytes for first output group
438dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm1, xmm4            // select bytes for second output group
439dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm2, xmm5            // select bytes for third output group
440dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx], xmm0  // output bytes 0..7
441dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx + 8], xmm1  // output bytes 8..15
442dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx + 16], xmm2  // output bytes 16..23
443dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 24]       // advance destination by 24 bytes
444dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 24               // 24 output pixels consumed per pass
445dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
446dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
447dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
448dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
449dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
450dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
451dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 32x2 rectangle to 24x1
452dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
453dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Then shuffled to do the scaling.
454dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
455dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Register usage:
456dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm0 src_row 0
457dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm1 src_row 1 (reloaded with kMadd21 for the third group)
458dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm2 shuf 0
459dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm3 shuf 1
460dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm4 shuf 2
461dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm5 madd 0
462dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm6 madd 1
463dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// xmm7 kRound34
464dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
465dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Note that movdqa+palignr may be better than movdqu.
466dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
467dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
468dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
469dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                ptrdiff_t src_stride,
470dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                uint8* dst_ptr, int dst_width) {
471dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {                            // the two rows are blended with equal weight
472dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi                   // one push: args start at [esp + 4 + 4]
473dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_ptr
474dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 4 + 8]    // src_stride
475dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4 + 12]   // dst_ptr
476dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // dst_width
477dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, kShuf01         // shuffle for output pixels 0..7
478dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, kShuf11         // shuffle for output pixels 8..15
479dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm4, kShuf21         // shuffle for output pixels 16..23
480dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm5, kMadd01         // pmaddubsw weights, pixels 0..7
481dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm6, kMadd11         // pmaddubsw weights, pixels 8..15
482dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm7, kRound34        // added before the shift right by 2
483dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // Each pass reads 32 columns x 2 rows and writes 24 bytes; runs at least once.
484c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
485dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
486dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // pixels 0..7
487dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + esi]     // same columns, next row
488dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm1            // average the two rows (rounds up)
489dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm2            // arrange source bytes into pairs
490dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm5            // weight each pair, sum into words
491dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddsw     xmm0, xmm7            // add rounding term
492dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 2               // divide by 4
493dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0            // 8 words -> 8 bytes in low qword
494dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx], xmm0  // store output bytes 0..7
495dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm0, [eax + 8]       // pixels 8..15
496dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm1, [eax + esi + 8]  // unaligned: offset 8 from aligned base
497dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm1            // average the two rows (rounds up)
498dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm3            // arrange source bytes into pairs
499dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm6            // weight each pair, sum into words
500dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddsw     xmm0, xmm7            // add rounding term
501dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 2               // divide by 4
502dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0            // 8 words -> 8 bytes in low qword
503dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx + 8], xmm0  // store output bytes 8..15
504dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax + 16]      // pixels 16..23
505dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + esi + 16]
506dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax, [eax + 32]       // advance source by 32 bytes
507dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm1            // average the two rows (rounds up)
508dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm4            // arrange source bytes into pairs
509dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, kMadd21         // xmm2-xmm7 hold constants, so load per pass
510dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm1            // weight each pair, sum into words
511dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddsw     xmm0, xmm7            // add rounding term
512dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 2               // divide by 4
513dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0            // 8 words -> 8 bytes in low qword
514dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 24               // sets flags for jg; movq/lea keep them
515dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx + 16], xmm0  // store output bytes 16..23
516dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 24]       // advance destination by 24 bytes
517dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
518dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
519dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
520dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
521dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
522dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
523dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
524dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Note that movdqa+palignr may be better than movdqu.
525dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
526dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
527dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
528dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                ptrdiff_t src_stride,
529dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                uint8* dst_ptr, int dst_width) {
530dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {                            // 32x2 in, 24x1 out; row 0 weighted about 3:1
531dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi                   // one push: args start at [esp + 4 + 4]
532dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_ptr
533dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 4 + 8]    // src_stride
534dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4 + 12]   // dst_ptr
535dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // dst_width
536dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, kShuf01         // shuffle for output pixels 0..7
537dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, kShuf11         // shuffle for output pixels 8..15
538dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm4, kShuf21         // shuffle for output pixels 16..23
539dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm5, kMadd01         // pmaddubsw weights, pixels 0..7
540dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm6, kMadd11         // pmaddubsw weights, pixels 8..15
541dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm7, kRound34        // added before the shift right by 2
542dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // Each pass reads 32 columns x 2 rows and writes 24 bytes; runs at least once.
543c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
544dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
545dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // pixels 0..7
546dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + esi]     // same columns, next row
547dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm0            // xmm1 = avg(row 0, row 1)
548dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm1            // xmm0 = avg(row 0, xmm1): ~3/4 row 0
549dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm2            // arrange source bytes into pairs
550dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm5            // weight each pair, sum into words
551dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddsw     xmm0, xmm7            // add rounding term
552dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 2               // divide by 4
553dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0            // 8 words -> 8 bytes in low qword
554dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx], xmm0  // store output bytes 0..7
555dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm0, [eax + 8]       // pixels 8..15
556dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     xmm1, [eax + esi + 8]  // unaligned: offset 8 from aligned base
557dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm0            // xmm1 = avg(row 0, row 1)
558dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm1            // xmm0 = avg(row 0, xmm1): ~3/4 row 0
559dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm3            // arrange source bytes into pairs
560dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm6            // weight each pair, sum into words
561dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddsw     xmm0, xmm7            // add rounding term
562dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 2               // divide by 4
563dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0            // 8 words -> 8 bytes in low qword
564dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx + 8], xmm0  // store output bytes 8..15
565dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax + 16]      // pixels 16..23
566dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + esi + 16]
567dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax, [eax + 32]       // advance source by 32 bytes
568dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm0            // xmm1 = avg(row 0, row 1)
569dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm1            // xmm0 = avg(row 0, xmm1): ~3/4 row 0
570dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm4            // arrange source bytes into pairs
571dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, kMadd21         // xmm2-xmm7 hold constants, so load per pass
572dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm1            // weight each pair, sum into words
573dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddsw     xmm0, xmm7            // add rounding term
574dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 2               // divide by 4
575dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0            // 8 words -> 8 bytes in low qword
576dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 24               // sets flags for jg; movq/lea keep them
577dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx + 16], xmm0  // store output bytes 16..23
578dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx+24]         // advance destination by 24 bytes
579dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop                 // repeat while remaining width > 0
580dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
581dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
582dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
583dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
584dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
585dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
586dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// 3/8 point sampler
587dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
588dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scale 32 pixels to 12
589dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
590dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
591dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                          uint8* dst_ptr, int dst_width) {
592dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
593dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_ptr
594dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
595dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_ptr
596dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
597dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm4, kShuf38a        // shuffle for source bytes 0..15
598dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm5, kShuf38b        // shuffle for source bytes 16..31
599dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // Each pass reads 32 source bytes and writes 12; the body runs at least once.
600c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
601dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  xloop:
602dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
603dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
604dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax, [eax + 32]       // advance source by 32 bytes
605dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm4            // select sampled bytes from first half
606dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm1, xmm5            // select sampled bytes from second half
607dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusb    xmm0, xmm1            // merge; presumably each shuffle zeroes the other's lanes - confirm in kShuf38a/b
608dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
609dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 12               // sets flags for jg; stores/lea keep them
610dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edx], xmm0  // write 12 pixels
611dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhlps    xmm1, xmm0            // high qword of xmm0 -> low qword of xmm1
612dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       [edx + 8], xmm1       // output bytes 8..11
613dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 12]       // advance destination by 12 bytes
614dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         xloop                 // repeat while remaining width > 0
615dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
616dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
617dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
618dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
619dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
620dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scale 16x3 pixels to 6x1 with interpolation
621dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
622dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
623dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                ptrdiff_t src_stride,
624dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                uint8* dst_ptr, int dst_width) {
625dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
626dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi                   // one push: args start at [esp + 4 + 4]
627dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_ptr
628dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 4 + 8]    // src_stride
629dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4 + 12]   // dst_ptr
630dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // dst_width
631dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, kShufAc         // placement of sums for columns 0..7
632dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, kShufAc3        // placement of sums for columns 8..15
633dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm4, kScaleAc33      // pmulhuw multipliers
634dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pxor       xmm5, xmm5            // zero, used to widen bytes to words
635dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // Each pass reads 16 columns x 3 rows and writes 6 bytes; runs at least once.
636c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
637dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  xloop:
638dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
639dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm6, [eax + esi]     // row 1
640dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhlps    xmm1, xmm0            // xmm1 low qword = row 0 columns 8..15
641dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhlps    xmm7, xmm6            // xmm7 low qword = row 1 columns 8..15
642dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm0, xmm5            // widen low 8 bytes to 8 words
643dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm1, xmm5
644dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm6, xmm5
645dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm7, xmm5
646dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm0, xmm6            // row 0 + row 1, columns 0..7
647dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm1, xmm7            // row 0 + row 1, columns 8..15
648dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm6, [eax + esi * 2]  // row 2
649dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax, [eax + 16]       // advance source by 16 bytes
650dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhlps    xmm7, xmm6            // xmm7 low qword = row 2 columns 8..15
651dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm6, xmm5
652dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm7, xmm5
653dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm0, xmm6            // + row 2, columns 0..7
654dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm1, xmm7            // + row 2, columns 8..15
655dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
656dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
657dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrldq     xmm0, 2               // shift down by one word
658dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm6, xmm0            // word i = s[i] + s[i+1]
659dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrldq     xmm0, 2               // shift down by one more word
660dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm6, xmm0            // word i = s[i] + s[i+1] + s[i+2]
661dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm6, xmm2            // place sums per kShufAc
662dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
663dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
664dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrldq     xmm1, 2               // shift down by one word
665dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm7, xmm1            // word i = s[i] + s[i+1]
666dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrldq     xmm1, 2               // shift down by one more word
667dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm7, xmm1            // word i = s[i] + s[i+1] + s[i+2]
668dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm7, xmm3            // place sums per kShufAc3
669dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm6, xmm7            // combine both halves into xmm6
670dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
671dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
672dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm6, xmm6            // words -> bytes in low qword
673dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
674dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 6                // sets flags for jg; stores/shift/lea keep them
675dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       [edx], xmm6           // write 6 pixels
676dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlq      xmm6, 16              // low dword now holds output bytes 2..5
677dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       [edx + 2], xmm6       // overlapping 4 byte store covers bytes 2..5
678dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 6]        // advance destination by 6 bytes
679dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         xloop                 // repeat while remaining width > 0
680dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
681dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
682dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
683dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
684dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
685dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
686dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Scale 16x2 pixels to 6x1 with interpolation
// Each loop iteration loads 16 bytes from src_ptr and 16 bytes from
// src_ptr + src_stride, averages the two rows with pavgb, gathers bytes with
// the kShufAb0/kShufAb1/kShufAb2 shuffle tables, sums them as 16 bit words,
// scales by kScaleAb2 and stores 6 bytes to dst_ptr.  dst_width is reduced
// by 6 per iteration; the loop body always runs at least once and repeats
// while the remaining count is greater than zero.
// The kShufAb*/kScaleAb2 tables are defined outside this excerpt.
// Alignment: src_ptr is read with movdqa and src_ptr + src_stride is a
// pavgb memory operand, so both addresses must be 16 byte aligned.
// Naked function: arguments are read from the stack by hand and esi is
// saved/restored explicitly.
687dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
688dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
689dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                ptrdiff_t src_stride,
690dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                uint8* dst_ptr, int dst_width) {
691dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
    // The push below moves esp by 4, hence the "4 +" in every argument offset.
692dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi
693dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_ptr
694dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 4 + 8]    // src_stride
695dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4 + 12]   // dst_ptr
696dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // dst_width
    // Loop-invariant shuffle masks and scale factors.
697dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, kShufAb0
698dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, kShufAb1
699dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm4, kShufAb2
700dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm5, kScaleAb2
701dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
702c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
703dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  xloop:
704dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]           // average 2 rows into xmm0
705dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, [eax + esi]
706dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax, [eax + 16]
707dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
    // Three shuffles of the averaged row are accumulated into xmm1 with
    // saturating 16 bit adds.
708dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
709dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm1, xmm2
710dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm6, xmm0
711dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm6, xmm3
712dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm1, xmm6
713dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm4
714dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm1, xmm0
715dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
716dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
717dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm1, xmm1
718dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
    // The flags set by this sub are consumed by the jg at the end of the
    // loop; the movd/psrlq/lea in between do not modify EFLAGS.
719dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 6
    // The 6 output bytes are stored as two overlapping 4 byte writes:
    // bytes 0..3 at dst, then (after shifting right by 2 bytes) bytes 2..5
    // at dst + 2.
720dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       [edx], xmm1           // write 6 pixels
721dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlq      xmm1, 16
722dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       [edx + 2], xmm1
723dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 6]
724dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         xloop
725dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
726dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
727dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
728dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
729dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
730dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
731dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 16xN bytes and produces 16 shorts at a time.
// For every group of 16 source bytes, the bytes of the first row and of the
// following src_height - 1 rows (src_stride bytes apart) are widened to
// 16 bits and summed with unsigned saturation (paddusw).  The 16 word sums
// (32 bytes) are stored to dst_ptr.  src_width is consumed 16 bytes per outer
// iteration; the outer loop always runs at least once.
// Alignment: all loads and stores use movdqa, so every source row address
// and dst_ptr must be 16 byte aligned.
// NOTE(review): src_height is assumed to be >= 1.  With src_height == 0 the
// row counter becomes -1 (non-zero), so yloop still executes once and a
// second row is read - confirm callers never pass 0.
732dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
733dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
734dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
735dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                       uint16* dst_ptr, int src_width,
736dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                       int src_height) {
737dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
    // Four registers are pushed (16 bytes), hence the "16 +" in the
    // argument offsets below.
738dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi
739dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       edi
740dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       ebx
741dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       ebp
742dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 16 + 4]   // src_ptr
743dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 16 + 8]   // src_stride
744dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edi, [esp + 16 + 12]  // dst_ptr
745dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16 + 16]  // src_width
746dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ebx, [esp + 16 + 20]  // src_height
747dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pxor       xmm4, xmm4            // zero, used to widen bytes to words
748dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    dec        ebx                   // ebx = rows remaining after the first
749dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
750c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
751dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  xloop:
752dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // first row
753dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [esi]
754dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax, [esi + edx]      // eax = same column in the second row
755dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, xmm0
756dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm0, xmm4            // low 8 bytes -> 8 words
757dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckhbw  xmm1, xmm4            // high 8 bytes -> 8 words
758dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        esi, [esi + 16]
759dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ebp, ebx              // per-column copy of the row counter
760dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    test       ebp, ebp
761dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    je         ydone                 // single row: nothing more to add
762dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
763dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // sum remaining rows
764c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
765dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  yloop:
766dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, [eax]       // read 16 pixels
767dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax, [eax + edx]  // advance to next row
768dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, xmm2
769dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm2, xmm4
770dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckhbw  xmm3, xmm4
771dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm0, xmm2        // sum 16 words
772dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddusw    xmm1, xmm3
773dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ebp, 1
774dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         yloop
775dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
776c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
777dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  ydone:
    // Store the 16 word sums (32 bytes) for this column group.
778dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edi], xmm0
779dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edi + 16], xmm1
780dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edi, [edi + 32]
781dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
782dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 16
783dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         xloop
784dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
785dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        ebp
786dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        ebx
787dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        edi
788dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
789dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
790dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
791dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
792dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
793dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Bilinear column filtering. SSSE3 version.
// x and dx are used as 16.16 fixed point: word 1 of x (the upper 16 bits) is
// extracted as the source byte index, and the top 7 bits of the lower word
// are used as the blend fraction f.  For each output byte the two source
// bytes at src_ptr[xi] and src_ptr[xi + 1] are loaded as one 16 bit word and
// blended as (p0 * (0x7f - f) + p1 * f) >> 7.  The main loop produces
// 2 output bytes per iteration; a single trailing byte is handled after it.
// dst_width <= 0 writes nothing.
794dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// TODO(fbarchard): Port to Neon
79548e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// TODO(fbarchard): Switch the following:
79648e536431342238d6747ca6911c1772f44d90979fbarchard@google.com//    xor        ebx, ebx
79748e536431342238d6747ca6911c1772f44d90979fbarchard@google.com//    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
79848e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// To
79948e536431342238d6747ca6911c1772f44d90979fbarchard@google.com//    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
80048e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// when drmemory bug fixed.
80148e536431342238d6747ca6911c1772f44d90979fbarchard@google.com// https://code.google.com/p/drmemory/issues/detail?id=1396
// NOTE(review): the loads in the function below already use the movzx form,
// so the TODO above appears to be stale - confirm and remove.
802dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
803dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
804dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
805dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                           int dst_width, int x, int dx) {
806dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
    // Three registers are pushed (12 bytes), hence the "12 +" in the
    // argument offsets below.
807dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       ebx
808dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi
809dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       edi
810dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edi, [esp + 12 + 4]    // dst_ptr
811dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 12 + 8]    // src_ptr
812dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 12 + 12]   // dst_width
813dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm2, [esp + 12 + 16]  // x
814dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm3, [esp + 12 + 20]  // dx
    // pshufb control bytes 0,0,4,4: duplicates the fraction byte of x0
    // (byte 0) and of x1 (byte 4) so each lines up with a source byte pair.
815dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
816dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm5, eax
817dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
818dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm6, 9
819dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    // Fewer than 2 output bytes requested: skip the 2 pixel loop.
820dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 2
821dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jl         xloop29
822dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
823dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, xmm2           // x1 = x0 + dx
824dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm0, xmm3
825dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm2, xmm0           // x0 x1
826dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm3, xmm3           // dx dx
827dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm3, xmm3           // dx * 2, dx * 2
828dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     edx, xmm2, 3         // get x1 integer. preroll
829dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
830dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // 2 Pixel loop.
831dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    align      4
832dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  xloop2:
833dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, xmm2           // x0, x1 fractions.
834dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm2, xmm3           // x += dx
8350db78ad127aacd528b5a699a8d0f3d6fb01e4a01fbarchard@google.com    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
836dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm0, ebx
837dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 9              // 7 bit fractions.
8380db78ad127aacd528b5a699a8d0f3d6fb01e4a01fbarchard@google.com    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
839dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm4, ebx
840dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm1, xmm5           // 0011
841dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklwd  xmm0, xmm4           // byte pairs for x0 then x1
    // xor with 0x007f per word: even bytes become 0x7f - f, odd bytes stay f.
842dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pxor       xmm1, xmm6           // 0..7f and 7f..0
843dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
844dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
845dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
846dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
847dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
848dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       ebx, xmm0
849dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        [edi], bx
850dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edi, [edi + 2]
851dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 2               // 2 pixels
852dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jge        xloop2
853dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
854dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    align      4
855dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop29:
856dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
    // ecx is -2 or -1 here for a non-negative dst_width; adding 1 leaves a
    // non-negative value only when exactly one output byte remains.
857dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    add        ecx, 2 - 1
858dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jl         xloop99
859dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
860dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // 1 pixel remainder
    // eax already holds the integer part of x0 from the preroll or from the
    // last loop iteration.
8610db78ad127aacd528b5a699a8d0f3d6fb01e4a01fbarchard@google.com    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
862dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm0, ebx
863dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm2, 9              // 7 bit fractions.
864dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm2, xmm5           // 0011
865dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pxor       xmm2, xmm6           // 0..7f and 7f..0
866dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm2           // 16 bit
867dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
868dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0           // 8 bits
869dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       ebx, xmm0
870dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        [edi], bl
871dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
872c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
873dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop99:
874dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
875dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        edi
876dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
877dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        ebx
878dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
879dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
880dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
881dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
882dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 16 pixels, duplicates them and writes 32 pixels.
883dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
// Every source byte is written twice in a row (punpcklbw/punpckhbw of a
// register with itself).  dst_width is reduced by 32 per iteration; the loop
// body always runs at least once and repeats while the remaining count is
// greater than zero.  The x and dx arguments are never read by this routine.
884dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
885dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
8861f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com                       int dst_width, int x, int dx) {
887dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
888dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4]    // dst_ptr
889dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 8]    // src_ptr
890dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 12]   // dst_width
891dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
892c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
893dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
894dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]
895dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 16]
896dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, xmm0
897dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklbw  xmm0, xmm0        // low 8 bytes, each doubled
898dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckhbw  xmm1, xmm1        // high 8 bytes, each doubled
    // Flags from this sub feed the jg below; movdqa and lea leave them intact.
899dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 32
900dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0
901dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx + 16], xmm1
902dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 32]
903dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop
904dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
905dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
906dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
907dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
908dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
909dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
910dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// Pixels are 4 bytes each: 32 source bytes are read and 16 bytes written per
// iteration.  shufps with 0xdd keeps dwords 1 and 3 of each source register,
// i.e. the odd-numbered pixels.  dst_width is reduced by 4 per iteration;
// the loop body always runs at least once and repeats while the remaining
// count is greater than zero.  src_stride is not read.
911dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
912dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDown2_SSE2(const uint8* src_argb,
9131f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com                            ptrdiff_t src_stride,
914dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                            uint8* dst_argb, int dst_width) {
915dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
916dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_argb
917dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
918dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_argb
919dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
920dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
921c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
922dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
923dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]
924dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]
925dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]
926dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    shufps     xmm0, xmm1, 0xdd      // odd pixels
927dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 4
928dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0
929dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]
930dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop
931dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
932dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
933dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
934dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
935dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
936dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 8x1 rectangle to 4x1.
937dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// Pixels are 4 bytes each.  Per iteration 8 source pixels are split into the
// 4 even and the 4 odd pixels, which are then averaged byte-wise with pavgb
// to give 4 output pixels.  dst_width is reduced by 4 per iteration; the
// loop body always runs at least once and repeats while the remaining count
// is greater than zero.  src_stride is not read.
938dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
939dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
9401f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com                                  ptrdiff_t src_stride,
941dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                  uint8* dst_argb, int dst_width) {
942dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
943dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4]        // src_argb
944dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
945dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12]       // dst_argb
946dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 16]       // dst_width
947dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
948c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
949dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
950dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]
951dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]
952dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]
953dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // copy: xmm0 is overwritten by shufps
954dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    shufps     xmm0, xmm1, 0x88      // even pixels
955dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    shufps     xmm2, xmm1, 0xdd      // odd pixels
956dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2            // average each even/odd pixel pair
957dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 4
958dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0
959dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]
960dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop
961dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
962dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
963dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
964dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
965dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
966dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends 8x2 rectangle to 4x1.
967dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// Pixels are 4 bytes each.  Per iteration 8 pixels are read from the row at
// src_argb and 8 from the row at src_argb + src_stride (all with movdqa, so
// the second row must be 16 byte aligned as well).  The two rows are
// averaged with pavgb, then even and odd pixels are averaged with pavgb to
// give 4 output pixels.  dst_width is reduced by 4 per iteration; the loop
// body always runs at least once and repeats while the remaining count is
// greater than zero.
968dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
969dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
970dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                               ptrdiff_t src_stride,
971dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                               uint8* dst_argb, int dst_width) {
972dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
    // The push below moves esp by 4, hence the "4 +" in every argument offset.
973dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi
974dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 4 + 4]    // src_argb
975dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 4 + 8]    // src_stride
976dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4 + 12]   // dst_argb
977dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 4 + 16]   // dst_width
978dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
979c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
980dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
981dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]
982dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, [eax + 16]
983dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, [eax + esi]
984dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm3, [eax + esi + 16]
985dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 32]
986dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2            // average rows
987dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm3
988dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
989dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    shufps     xmm0, xmm1, 0x88      // even pixels
990dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    shufps     xmm2, xmm1, 0xdd      // odd pixels
991dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2
992dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 4
993dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0
994dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]
995dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop
996dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
997dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
998dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
999dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
1000dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
1001dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1002dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 4 pixels at a time.
1003dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: dst_argb 16 byte aligned.
// Pixels are 4 bytes each.  Per iteration four pixels are gathered from
// src_argb at byte offsets 0, 1, 2 and 3 times (src_stepx * 4) using 4 byte
// movd loads, packed into one register and stored as 16 bytes to dst_argb.
// The source pointer then advances by 4 * (src_stepx * 4) bytes.  dst_width
// is reduced by 4 per iteration; the loop body always runs at least once and
// repeats while the remaining count is greater than zero.  src_stride is not
// read.
1004dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
1005dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
1006dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                               int src_stepx,
1007dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                               uint8* dst_argb, int dst_width) {
1008dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
    // Two registers are pushed (8 bytes), hence the "8 +" in the argument
    // offsets below.
1009dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       ebx
1010dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       edi
1011dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 8 + 4]    // src_argb
1012dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                     // src_stride ignored
1013dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ebx, [esp + 8 + 12]   // src_stepx
1014dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 8 + 16]   // dst_argb
1015dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 8 + 20]   // dst_width
1016dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        ebx, [ebx * 4]        // ebx = src_stepx * 4 (step in bytes)
1017dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edi, [ebx + ebx * 2]  // edi = 3 * step in bytes
1018dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1019c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1020dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
1021dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm0, [eax]              // pixel at offset 0
1022dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm1, [eax + ebx]        // pixel at 1 * step
1023dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm0, xmm1
1024dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm2, [eax + ebx * 2]    // pixel at 2 * step
1025dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm3, [eax + edi]        // pixel at 3 * step
1026dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + ebx * 4]
1027dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm2, xmm3
1028dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklqdq xmm0, xmm2               // xmm0 = the 4 gathered pixels
1029dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 4
1030dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0
1031dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]
1032dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop
1033dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1034dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        edi
1035dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        ebx
1036dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
1037dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
1038dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
1039dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1040dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Blends four 2x2 to 4x1.
// Each output pixel is the byte-wise average of a 2x2 box of 4-byte pixels:
// two adjacent pixels from the row at src_argb and the two pixels at the same
// offset in the row src_stride bytes further on. Consecutive boxes start
// src_stepx pixels apart. Four output pixels are stored per loop pass and the
// loop body always executes at least once.
// NOTE(review): dst_width is presumably a positive multiple of 4 - confirm
// against callers.
1041dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: dst_argb 16 byte aligned.
1042dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
1043dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
1044dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                  ptrdiff_t src_stride,
1045dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                  int src_stepx,
1046dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                                  uint8* dst_argb, int dst_width) {
1047dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
    // Three registers are pushed, so the arguments start at [esp + 12 + 4].
1048dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       ebx
1049dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi
1050dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       edi
1051dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 12 + 4]    // src_argb
1052dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 12 + 8]    // src_stride
1053dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ebx, [esp + 12 + 12]   // src_stepx
1054dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 12 + 16]   // dst_argb
1055dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 12 + 20]   // dst_width
1056dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        esi, [eax + esi]       // row1 pointer
1057dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        ebx, [ebx * 4]         // step in bytes (4 bytes per pixel)
1058dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edi, [ebx + ebx * 2]   // 3 * step in bytes
1059dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1060c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1061dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
    // Each 8 byte load fetches one horizontal pair of pixels; four pairs are
    // gathered per row at byte offsets 0, step, 2 * step and 3 * step.
1062dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       xmm0, qword ptr [eax]  // row0 4 pairs
1063dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhps     xmm0, qword ptr [eax + ebx]
1064dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       xmm1, qword ptr [eax + ebx * 2]
1065dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhps     xmm1, qword ptr [eax + edi]
1066dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + ebx * 4]  // advance row0 by 4 steps
1067dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       xmm2, qword ptr [esi]  // row1 4 pairs
1068dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhps     xmm2, qword ptr [esi + ebx]
1069dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       xmm3, qword ptr [esi + ebx * 2]
1070dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhps     xmm3, qword ptr [esi + edi]
1071dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        esi,  [esi + ebx * 4]  // advance row1 by 4 steps
1072dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2            // average rows
1073dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm1, xmm3
1074dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    // 0x88 picks dwords 0 and 2 of each source register (first pixel of each
    // pair); 0xdd picks dwords 1 and 3 (second pixel of each pair).
1075dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    shufps     xmm0, xmm1, 0x88      // even pixels
1076dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    shufps     xmm2, xmm1, 0xdd      // odd pixels
1077dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pavgb      xmm0, xmm2
    // The flags set by this sub are consumed by the jg below; the movdqa and
    // lea in between do not modify flags.
1078dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 4
1079dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0           // aligned 16 byte store of 4 pixels
1080dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 16]
1081dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop
1082dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1083dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        edi
1084dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
1085dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        ebx
1086dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
1087dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
1088dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
1089dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1090dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Column scaling unfiltered. SSE2 version.
// Copies dst_width 4-byte pixels to dst_argb. Output pixel i is the source
// pixel whose index is the upper 16 bits of (x + i * dx), i.e. x and dx are
// treated as 16.16 fixed point. Does nothing when dst_width <= 0.
// Stores use movdqu / movq / movd, so no alignment is required of dst_argb.
// Lane lists in the comments below are written highest lane first.
1091dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
1092dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
1093dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                        int dst_width, int x, int dx) {
1094dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
    // Two registers are pushed, so the arguments start at [esp + 8 + 4].
1095dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       edi
1096dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi
1097dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edi, [esp + 8 + 4]    // dst_argb
1098dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 8 + 8]    // src_argb
1099dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 8 + 12]   // dst_width
1100dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm2, [esp + 8 + 16]  // x
1101dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm3, [esp + 8 + 20]  // dx
1102dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
    // Build xmm2 = four consecutive sample positions and xmm3 = dx * 4 in
    // every lane (the per-iteration increment).
1103dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
1104dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
1105dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm2, xmm0
1106dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
1107dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
1108dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm2, xmm0            // x3 x2 x1 x0
1109dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
1110dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
1111dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
    // Odd numbered 16 bit words are the upper halves of the 32 bit lanes,
    // i.e. the integer part of each 16.16 position.
1112dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     eax, xmm2, 1          // get x0 integer.
1113dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     edx, xmm2, 3          // get x1 integer.
1114dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1115dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    cmp        ecx, 0
1116dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jle        xloop99
    // ecx is biased by -4 from here on; the loop continues while at least 4
    // pixels remain.
1117dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 4
1118dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jl         xloop49
1119dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1120dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // 4 Pixel loop.
1121dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    align      4
1122dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop4:
1123dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
1124dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
1125dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     eax, xmm2, 5           // get x2 integer.
1126dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     edx, xmm2, 7           // get x3 integer.
1127dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm2, xmm3             // x += dx
1128dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm0, xmm1             // x0 x1
1129dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1130dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
1131dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
1132dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
1133dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
1134dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm1, xmm4             // x2 x3
1135dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    // The flags from this sub are consumed by the jge below; the movdqu and
    // lea in between do not modify flags.
1136dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 4                 // 4 pixels
1137dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqu     [edi], xmm0
1138dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edi, [edi + 16]
1139dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jge        xloop4
1140dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
    // Here ecx is (remaining pixels - 4), in the range -4..-1. Its two low
    // bits equal those of the remaining count, so they are tested directly.
1141dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    align      4
1142dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop49:
1143dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    test       ecx, 2
1144dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    je         xloop29
1145dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1146dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // 2 Pixels.
1147dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
1148dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
1149dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     eax, xmm2, 5           // get x2 integer.
1150dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm0, xmm1             // x0 x1
1151dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1152dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edi], xmm0
1153dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edi, [edi + 8]
1154dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1155dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop29:
1156dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    test       ecx, 1
1157dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    je         xloop99
1158dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1159dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // 1 Pixels.
1160dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm0, [esi + eax * 4]  // 1 source pixel: x2 if the 2 pixel step ran, else x0
1161dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       dword ptr [edi], xmm0
1162dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    align      4
1163dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop99:
1164dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1165dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
1166dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        edi
1167dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
1168dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
1169dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
1170dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1171dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
1172dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// TODO(fbarchard): Port to Neon
1173dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1174dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
// Used as a pshufb control: within each 8 byte half it interleaves bytes 0-3
// with bytes 4-7, so byte k of the first pixel is followed by byte k of the
// second pixel.
1175dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuffleColARGB = {
1176dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
1177dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
1178dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com};
1179dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1180dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Shuffle table for duplicating 2 fractions into 8 bytes each
// Used as a pshufb control: source byte 0 fills the low 8 bytes and source
// byte 4 fills the high 8 bytes.
1181dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comstatic uvec8 kShuffleFractions = {
1182dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
1183dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com};
1184dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
// Horizontally filtered column scaling of 4-byte pixels.
// For each output pixel the position x is split into an index (upper 16 bits)
// and a 7 bit fraction f (bits 9..15). The two adjacent source pixels p0 and
// p1 at that index are blended per byte as (p0 * (0x7f - f) + p1 * f) >> 7,
// then x advances by dx. Note the two weights sum to 0x7f, not 0x80.
// Two pixels are produced per loop pass, followed by an optional single pixel.
// Each sample reads 8 bytes (2 pixels) starting at the indexed source pixel.
// NOTE(review): the movdqa loads of kShuffleColARGB and kShuffleFractions
// assume uvec8 is 16 byte aligned - confirm against its declaration.
1185dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
1186dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
1187dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com                               int dst_width, int x, int dx) {
1188dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
    // Two registers are pushed, so the arguments start at [esp + 8 + 4].
1189dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       esi
1190dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    push       edi
1191dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edi, [esp + 8 + 4]    // dst_argb
1192dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        esi, [esp + 8 + 8]    // src_argb
1193dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 8 + 12]   // dst_width
1194dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm2, [esp + 8 + 16]  // x
1195dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       xmm3, [esp + 8 + 20]  // dx
1196dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm4, kShuffleColARGB
1197dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm5, kShuffleFractions
1198dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
1199dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm6, 9
1200dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    // ecx is biased by -2 from here on; fewer than 2 pixels skips the loop.
1201dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 2
1202dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jl         xloop29
1203dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1204dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, xmm2           // x1 = x0 + dx
1205dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm0, xmm3
1206dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm2, xmm0           // x0 x1
1207dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm3, xmm3           // dx dx
1208dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm3, xmm3           // dx * 2, dx * 2
1209dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     edx, xmm2, 3         // get x1 integer. preroll
1210dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1211dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // 2 Pixel loop.
1212dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    align      4
1213dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  xloop2:
1214dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, xmm2           // x0, x1 fractions.
1215dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    paddd      xmm2, xmm3           // x += dx
1216dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    // The word shift leaves each fraction's top 7 bits in bytes 0 and 4, which
    // are the only bytes the following pshufb selects.
1217dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm1, 9              // 7 bit fractions.
1218dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
1219dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm1, xmm5           // 0000000011111111
1220dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm4           // arrange pixels into pairs
    // xor with 0x007f per word turns each byte pair (f, f) into (0x7f - f, f),
    // the weights for the (left, right) source bytes.
1221dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pxor       xmm1, xmm6           // 0..7f and 7f..0
1222dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
1223dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
1224dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
1225dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
1226dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
1227dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       qword ptr [edi], xmm0
1228dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edi, [edi + 8]
1229dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 2               // 2 pixels
1230dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jge        xloop2
1231dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1232dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    align      4
1233dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop29:
1234dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
    // ecx is (remaining - 2) here; adding 1 gives (remaining - 1), which is
    // negative exactly when no pixel is left.
1235dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    add        ecx, 2 - 1
1236dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jl         xloop99
1237dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1238dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    // 1 pixel remainder
    // Only the low 8 bytes of the registers matter from here on: a single
    // pixel pair is loaded and only 4 result bytes are stored.
1239dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm2, 9              // 7 bit fractions.
1240dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
1241dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm2, xmm5           // 00000000
1242dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pshufb     xmm0, xmm4           // arrange pixels into pairs
1243dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pxor       xmm2, xmm6           // 0..7f and 7f..0
1244dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
1245dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    psrlw      xmm0, 7
1246dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
1247dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movd       [edi], xmm0
1248dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1249c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1250dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com xloop99:
1251dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1252dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        edi
1253dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    pop        esi
1254dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
1255dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
1256dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
1257dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1258dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Reads 4 pixels, duplicates them and writes 8 pixels.
// Each 4-byte source pixel is written twice in a row, doubling the width.
// The x and dx arguments are never read by this routine. The loop body always
// executes at least once and handles 8 destination pixels per pass.
// NOTE(review): dst_width is presumably a positive multiple of 8 - confirm
// against callers.
1259dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
1260dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com__declspec(naked) __declspec(align(16))
1261dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.comvoid ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
12621f923e3ea6de7afd9380c73f60a2f3e7b0588811fbarchard@google.com                           int dst_width, int x, int dx) {
1263dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  __asm {
1264dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        edx, [esp + 4]    // dst_argb
1265dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        eax, [esp + 8]    // src_argb
1266dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    mov        ecx, [esp + 12]   // dst_width
1267dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1268c2295807bdcfcc45d932f228f0ed3f7124005de6fbarchard@google.com    align      4
1269dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  wloop:
1270dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm0, [eax]       // 4 source pixels
1271dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        eax,  [eax + 16]
1272dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     xmm1, xmm0
1273dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckldq  xmm0, xmm0        // pixels 0 0 1 1
1274dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    punpckhdq  xmm1, xmm1        // pixels 2 2 3 3
    // The flags from this sub are consumed by the jg below; the stores and
    // lea in between do not modify flags.
1275dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    sub        ecx, 8
1276dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx], xmm0
1277dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    movdqa     [edx + 16], xmm1
1278dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    lea        edx, [edx + 32]
1279dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    jg         wloop
1280dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1281dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com    ret
1282dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com  }
1283dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}
1284dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
12855dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com// Divide num by div and return as 16.16 fixed point result.
// Computes (num << 16) / div using a 64 bit signed dividend in edx:eax and a
// 32 bit signed divisor; the quotient is left in eax as the return value.
// NOTE(review): idiv raises a divide error when div is 0 or the quotient does
// not fit in 32 bits - callers presumably guarantee neither occurs; confirm.
12865dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com__declspec(naked) __declspec(align(16))
12875dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.comint FixedDiv_X86(int num, int div) {
12885dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com  __asm {
12895dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    mov        eax, [esp + 4]    // num
12905dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    cdq                          // extend num to 64 bits
12915dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    shld       edx, eax, 16      // 32.16
12925dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    shl        eax, 16
12935dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    idiv       dword ptr [esp + 8]   // edx:eax / div
12945dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    ret
12955dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com  }
12965dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com}
12975dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com
12985dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com// Divide (num << 16) - 0x00010001 by div - 1 and return the 32 bit quotient.
// Unlike FixedDiv_X86 this is not a plain num / div: 0x00010001 is subtracted
// from the 64 bit shifted numerator and 1 is subtracted from the divisor
// before dividing. The quotient is left in eax as the return value.
// NOTE(review): idiv raises a divide error when div is 1 (divisor becomes 0)
// or the quotient does not fit in 32 bits - confirm callers avoid both.
12995dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com__declspec(naked) __declspec(align(16))
13005dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.comint FixedDiv1_X86(int num, int div) {
13015dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com  __asm {
13025dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    mov        eax, [esp + 4]    // num
13035dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    mov        ecx, [esp + 8]    // denom
13045dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    cdq                          // extend num to 64 bits
13055dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    shld       edx, eax, 16      // 32.16
13065dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    shl        eax, 16
13075dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    sub        eax, 0x00010001   // 64 bit subtract: low dword
13085dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    sbb        edx, 0            // propagate borrow into high dword
13095dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    sub        ecx, 1            // denom - 1
13105dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    idiv       ecx
13115dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com    ret
13125dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com  }
13135dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com}
13145dba58cb1ed4117f491267f68351a6079eaed667fbarchard@google.com
1315dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
1316dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com
1317dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#ifdef __cplusplus
1318dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}  // extern "C"
1319dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com}  // namespace libyuv
1320dbe4814361fb8fcbc462bbe45a2f39360e14a982fbarchard@google.com#endif
1321