scale.c revision 1b362b15af34006e6a11974088a46d42b903418e
1/*
2 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "third_party/libyuv/include/libyuv/scale.h"
12
13#include <assert.h>
14#include <string.h>
15
16#include "third_party/libyuv/include/libyuv/cpu_id.h"
17#include "third_party/libyuv/source/row.h"
18
19#ifdef __cplusplus
20namespace libyuv {
21extern "C" {
22#endif
23
24/*
25 * Note: Defining YUV_DISABLE_ASM disables the assembly paths and falls back to the C versions.
26 */
27//#define YUV_DISABLE_ASM
28
29#if defined(_MSC_VER)
30#define ALIGN16(var) __declspec(align(16)) var
31#else
32#define ALIGN16(var) var __attribute__((aligned(16)))
33#endif
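// Example use of ALIGN16 (hypothetical local buffer, illustration only):
//   ALIGN16(uint8 row_buf[64]);  // 16-byte aligned scratch row for SIMD loads/stores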
34
35// Note: A Neon reference manual
36// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
37// Note: Some SSE2 reference manuals
38// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
39
40// Set the following flag (via SetUseReferenceImpl) to a non-zero value to
41// revert to using only the reference implementation ScalePlaneBox(), and
42// NOT the optimized versions. Useful for debugging and for comparing the
43// quality of the YUV planes produced by the optimized and non-optimized
44// versions.
45
46static int use_reference_impl_ = 0;
47
48void SetUseReferenceImpl(int use) {
49  use_reference_impl_ = use;
50}
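// Usage sketch (called from application code; illustration only):
//   SetUseReferenceImpl(1);   // force the ScalePlaneBox() reference path
//   SetUseReferenceImpl(0);   // restore the optimized row functions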
51
52// ScaleRowDown2Int also used by planar functions
53
54/**
55 * NEON downscalers with interpolation.
56 *
57 * Provided by Fritz Koenig
58 *
59 */
60
61#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
62#define HAS_SCALEROWDOWN2_NEON
63void ScaleRowDown2_NEON(const uint8* src_ptr, int  src_stride,
64                        uint8* dst, int dst_width) {
65  asm volatile (
66    "1:                                        \n"
67    "vld2.u8    {q0,q1}, [%0]!                 \n"  // load even pixels into q0, odd into q1
68    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
69    "subs       %2, %2, #16                    \n"  // 16 processed per loop
70    "bhi        1b                             \n"
71    : "+r"(src_ptr),          // %0
72      "+r"(dst),              // %1
73      "+r"(dst_width)         // %2
74    :
75    : "q0", "q1"              // Clobber List
76  );
77}
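// A minimal C sketch of what the loop above computes (illustration only, kept
// out of the build; the asm handles 16 output pixels per iteration):
#if 0
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // keep the even pixels, drop the odd ones
  }
}
#endif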
78
79void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
80                           uint8* dst, int dst_width) {
81  asm volatile (
82    "add        %1, %0                         \n"  // change the stride to row 2 pointer
83    "1:                                        \n"
84    "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post increment
85    "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post increment
86    "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
87    "vpaddl.u8  q1, q1                         \n"
88    "vpadal.u8  q0, q2                         \n"  // row 2 add adjacent, add row 1 to row 2
89    "vpadal.u8  q1, q3                         \n"
90    "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
91    "vrshrn.u16 d1, q1, #2                     \n"
92    "vst1.u8    {q0}, [%2]!                    \n"
93    "subs       %3, %3, #16                    \n"  // 16 processed per loop
94    "bhi        1b                             \n"
95    : "+r"(src_ptr),          // %0
96      "+r"(src_stride),       // %1
97      "+r"(dst),              // %2
98      "+r"(dst_width)         // %3
99    :
100    : "q0", "q1", "q2", "q3"     // Clobber List
101   );
102}
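// Scalar sketch of the 2x2 box average above (illustration only, not built;
// the asm rounds via vrshrn, matching the "+ 2" below):
#if 0
static void ScaleRowDown2Int_C_Sketch(const uint8* src_ptr, int src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[x * 2] + s[x * 2 + 1] +
                      t[x * 2] + t[x * 2 + 1] + 2) >> 2);
  }
}
#endif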
103
104#define HAS_SCALEROWDOWN4_NEON
105static void ScaleRowDown4_NEON(const uint8* src_ptr, int src_stride,
106                               uint8* dst_ptr, int dst_width) {
107  asm volatile (
108    "1:                                        \n"
109    "vld2.u8    {d0, d1}, [%0]!                \n"
110    "vtrn.u8    d1, d0                         \n"
111    "vshrn.u16  d0, q0, #8                     \n"
112    "vst1.u32   {d0[1]}, [%1]!                 \n"
113
114    "subs       %2, #4                         \n"
115    "bhi        1b                             \n"
116    : "+r"(src_ptr),          // %0
117      "+r"(dst_ptr),          // %1
118      "+r"(dst_width)         // %2
119    :
120    : "q0", "q1", "memory", "cc"
121  );
122}
123
124static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
125                                  uint8* dst_ptr, int dst_width) {
126  asm volatile (
127    "add        r4, %0, %3                     \n"
128    "add        r5, r4, %3                     \n"
129    "add        %3, r5, %3                     \n"
130    "1:                                        \n"
131    "vld1.u8    {q0}, [%0]!                    \n"   // load up 16x4 block of input data
132    "vld1.u8    {q1}, [r4]!                    \n"
133    "vld1.u8    {q2}, [r5]!                    \n"
134    "vld1.u8    {q3}, [%3]!                    \n"
135
136    "vpaddl.u8  q0, q0                         \n"
137    "vpadal.u8  q0, q1                         \n"
138    "vpadal.u8  q0, q2                         \n"
139    "vpadal.u8  q0, q3                         \n"
140
141    "vpaddl.u16 q0, q0                         \n"
142
143    "vrshrn.u32 d0, q0, #4                     \n"   // divide by 16 w/rounding
144
145    "vmovn.u16  d0, q0                         \n"
146    "vst1.u32   {d0[0]}, [%1]!                 \n"
147
148    "subs       %2, #4                         \n"
149    "bhi        1b                             \n"
150
151    : "+r"(src_ptr),          // %0
152      "+r"(dst_ptr),          // %1
153      "+r"(dst_width)         // %2
154    : "r"(src_stride)         // %3
155    : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
156  );
157}
158
159#define HAS_SCALEROWDOWN34_NEON
160// Down scale from 4 to 3 pixels.  Use the NEON multilane read/write
161//  to load every 4th pixel into one of 4 different registers.
162// Point samples 32 pixels to 24 pixels.
163static void ScaleRowDown34_NEON(const uint8* src_ptr, int src_stride,
164                                uint8* dst_ptr, int dst_width) {
165  asm volatile (
166    "1:                                        \n"
167    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
168    "vmov         d2, d3                       \n" // order needs to be d0, d1, d2
169    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
170    "subs         %2, #24                      \n"
171    "bhi          1b                           \n"
172    : "+r"(src_ptr),          // %0
173      "+r"(dst_ptr),          // %1
174      "+r"(dst_width)         // %2
175    :
176    : "d0", "d1", "d2", "d3", "memory", "cc"
177  );
178}
179
180static void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, int src_stride,
181                                      uint8* dst_ptr, int dst_width) {
182  asm volatile (
183    "vmov.u8      d24, #3                      \n"
184    "add          %3, %0                       \n"
185    "1:                                        \n"
186    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
187    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
188
189    // filter src line 0 with src line 1
190    // expand chars to shorts to allow for room
191    // when adding lines together
192    "vmovl.u8     q8, d4                       \n"
193    "vmovl.u8     q9, d5                       \n"
194    "vmovl.u8     q10, d6                      \n"
195    "vmovl.u8     q11, d7                      \n"
196
197    // 3 * line_0 + line_1
198    "vmlal.u8     q8, d0, d24                  \n"
199    "vmlal.u8     q9, d1, d24                  \n"
200    "vmlal.u8     q10, d2, d24                 \n"
201    "vmlal.u8     q11, d3, d24                 \n"
202
203    // (3 * line_0 + line_1) >> 2
204    "vqrshrn.u16  d0, q8, #2                   \n"
205    "vqrshrn.u16  d1, q9, #2                   \n"
206    "vqrshrn.u16  d2, q10, #2                  \n"
207    "vqrshrn.u16  d3, q11, #2                  \n"
208
209    // a0 = (src[0] * 3 + s[1] * 1) >> 2
210    "vmovl.u8     q8, d1                       \n"
211    "vmlal.u8     q8, d0, d24                  \n"
212    "vqrshrn.u16  d0, q8, #2                   \n"
213
214    // a1 = (src[1] * 1 + s[2] * 1) >> 1
215    "vrhadd.u8    d1, d1, d2                   \n"
216
217    // a2 = (src[2] * 1 + s[3] * 3) >> 2
218    "vmovl.u8     q8, d2                       \n"
219    "vmlal.u8     q8, d3, d24                  \n"
220    "vqrshrn.u16  d2, q8, #2                   \n"
221
222    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
223
224    "subs         %2, #24                      \n"
225    "bhi          1b                           \n"
226    : "+r"(src_ptr),          // %0
227      "+r"(dst_ptr),          // %1
228      "+r"(dst_width),        // %2
229      "+r"(src_stride)        // %3
230    :
231    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
232  );
233}
234
235static void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, int src_stride,
236                                      uint8* dst_ptr, int dst_width) {
237  asm volatile (
238    "vmov.u8      d24, #3                      \n"
239    "add          %3, %0                       \n"
240    "1:                                        \n"
241    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n" // src line 0
242    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n" // src line 1
243
244    // average src line 0 with src line 1
245    "vrhadd.u8    q0, q0, q2                   \n"
246    "vrhadd.u8    q1, q1, q3                   \n"
247
248    // a0 = (src[0] * 3 + s[1] * 1) >> 2
249    "vmovl.u8     q3, d1                       \n"
250    "vmlal.u8     q3, d0, d24                  \n"
251    "vqrshrn.u16  d0, q3, #2                   \n"
252
253    // a1 = (src[1] * 1 + s[2] * 1) >> 1
254    "vrhadd.u8    d1, d1, d2                   \n"
255
256    // a2 = (src[2] * 1 + s[3] * 3) >> 2
257    "vmovl.u8     q3, d2                       \n"
258    "vmlal.u8     q3, d3, d24                  \n"
259    "vqrshrn.u16  d2, q3, #2                   \n"
260
261    "vst3.u8      {d0, d1, d2}, [%1]!          \n"
262
263    "subs         %2, #24                      \n"
264    "bhi          1b                           \n"
265    : "+r"(src_ptr),          // %0
266      "+r"(dst_ptr),          // %1
267      "+r"(dst_width),        // %2
268      "+r"(src_stride)        // %3
269    :
270    : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
271  );
272}
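// Scalar sketch of the horizontal 4 -> 3 step shared by the two routines above
// (illustration only, not built).  Vertically, the _0 variant first blends the
// rows as (3 * row0 + row1 + 2) >> 2 and the _1 variant as (row0 + row1 + 1) >> 1.
#if 0
static void ScaleRowDown34_Horizontal_Sketch(const uint8* p, uint8* dst,
                                             int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3, p += 4, dst += 3) {
    dst[0] = (uint8)((p[0] * 3 + p[1] + 2) >> 2);
    dst[1] = (uint8)((p[1] + p[2] + 1) >> 1);
    dst[2] = (uint8)((p[2] + p[3] * 3 + 2) >> 2);
  }
}
#endif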
273
274#define HAS_SCALEROWDOWN38_NEON
275const uint8 shuf38[16] __attribute__ ((aligned(16))) =
276  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
277const uint8 shuf38_2[16] __attribute__ ((aligned(16))) =
278  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
279const unsigned short mult38_div6[8] __attribute__ ((aligned(16))) =
280  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
281    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
282const unsigned short mult38_div9[8] __attribute__ ((aligned(16))) =
283  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
284    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };
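// Note on the two tables above: vqrdmulh computes roughly (2 * a * b + 32768) >> 16,
// so multiplying a 16-bit sum by 65536 / 12 divides it by 6, and by 65536 / 18
// divides it by 9.  Scalar equivalent (illustration only):
//   (2 * sum * (65536 / 12) + 32768) >> 16    // ~= sum / 6, e.g. 54 -> 9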
285
286// 32 -> 12
287static void ScaleRowDown38_NEON(const uint8* src_ptr, int src_stride,
288                                uint8* dst_ptr, int dst_width) {
289  asm volatile (
290    "vld1.u8      {q3}, [%3]                   \n"
291    "1:                                        \n"
292    "vld1.u8      {d0, d1, d2, d3}, [%0]!      \n"
293    "vtbl.u8      d4, {d0, d1, d2, d3}, d6     \n"
294    "vtbl.u8      d5, {d0, d1, d2, d3}, d7     \n"
295    "vst1.u8      {d4}, [%1]!                  \n"
296    "vst1.u32     {d5[0]}, [%1]!               \n"
297    "subs         %2, #12                      \n"
298    "bhi          1b                           \n"
299    : "+r"(src_ptr),          // %0
300      "+r"(dst_ptr),          // %1
301      "+r"(dst_width)         // %2
302    : "r"(shuf38)             // %3
303    : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
304  );
305}
306
307// 32x3 -> 12x1
308static void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, int src_stride,
309                                      uint8* dst_ptr, int dst_width) {
310  asm volatile (
311    "vld1.u16     {q13}, [%4]                  \n"
312    "vld1.u8      {q14}, [%5]                  \n"
313    "vld1.u8      {q15}, [%6]                  \n"
314    "add          r4, %0, %3, lsl #1           \n"
315    "add          %3, %0                       \n"
316    "1:                                        \n"
317
318    // d0 = 00 40 01 41 02 42 03 43
319    // d1 = 10 50 11 51 12 52 13 53
320    // d2 = 20 60 21 61 22 62 23 63
321    // d3 = 30 70 31 71 32 72 33 73
322    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
323    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
324    "vld4.u8      {d16, d17, d18, d19}, [r4]!  \n"
325
326    // Shuffle the input data around to align the data
327    //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
328    // d0 = 00 10 01 11 02 12 03 13
329    // d1 = 40 50 41 51 42 52 43 53
330    "vtrn.u8      d0, d1                       \n"
331    "vtrn.u8      d4, d5                       \n"
332    "vtrn.u8      d16, d17                     \n"
333
334    // d2 = 20 30 21 31 22 32 23 33
335    // d3 = 60 70 61 71 62 72 63 73
336    "vtrn.u8      d2, d3                       \n"
337    "vtrn.u8      d6, d7                       \n"
338    "vtrn.u8      d18, d19                     \n"
339
340    // d0 = 00+10 01+11 02+12 03+13
341    // d2 = 40+50 41+51 42+52 43+53
342    "vpaddl.u8    q0, q0                       \n"
343    "vpaddl.u8    q2, q2                       \n"
344    "vpaddl.u8    q8, q8                       \n"
345
346    // d3 = 60+70 61+71 62+72 63+73
347    "vpaddl.u8    d3, d3                       \n"
348    "vpaddl.u8    d7, d7                       \n"
349    "vpaddl.u8    d19, d19                     \n"
350
351    // combine source lines
352    "vadd.u16     q0, q2                       \n"
353    "vadd.u16     q0, q8                       \n"
354    "vadd.u16     d4, d3, d7                   \n"
355    "vadd.u16     d4, d19                      \n"
356
357    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
358    //             + s[6 + st * 1] + s[7 + st * 1]
359    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
360    "vqrdmulh.s16 q2, q13                      \n"
361    "vmovn.u16    d4, q2                       \n"
362
363    // Shuffle 2,3 reg around so that 2 can be added to the
364    //  0,1 reg and 3 can be added to the 4,5 reg.  This
365    //  requires expanding from u8 to u16 as the 0,1 and 4,5
366    //  registers are already expanded.  Then do transposes
367    //  to get aligned.
368    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
369    "vmovl.u8     q1, d2                       \n"
370    "vmovl.u8     q3, d6                       \n"
371    "vmovl.u8     q9, d18                      \n"
372
373    // combine source lines
374    "vadd.u16     q1, q3                       \n"
375    "vadd.u16     q1, q9                       \n"
376
377    // d4 = xx 20 xx 30 xx 22 xx 32
378    // d5 = xx 21 xx 31 xx 23 xx 33
379    "vtrn.u32     d2, d3                       \n"
380
381    // d4 = xx 20 xx 21 xx 22 xx 23
382    // d5 = xx 30 xx 31 xx 32 xx 33
383    "vtrn.u16     d2, d3                       \n"
384
385    // 0+1+2, 3+4+5
386    "vadd.u16     q0, q1                       \n"
387
388    // Need to divide, but can't downshift as the the value
389    //  isn't a power of 2.  So multiply by 65536 / n
390    //  and take the upper 16 bits.
391    "vqrdmulh.s16 q0, q15                      \n"
392
393    // Align for table lookup, vtbl requires registers to
394    //  be adjacent
395    "vmov.u8      d2, d4                       \n"
396
397    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
398    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
399
400    "vst1.u8      {d3}, [%1]!                  \n"
401    "vst1.u32     {d4[0]}, [%1]!               \n"
402    "subs         %2, #12                      \n"
403    "bhi          1b                           \n"
404    : "+r"(src_ptr),          // %0
405      "+r"(dst_ptr),          // %1
406      "+r"(dst_width),        // %2
407      "+r"(src_stride)        // %3
408    : "r"(mult38_div6),       // %4
409      "r"(shuf38_2),          // %5
410      "r"(mult38_div9)        // %6
411    : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
412      "q13", "q14", "q15", "memory", "cc"
413  );
414}
415
416// 32x2 -> 12x1
417static void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, int src_stride,
418                                      uint8* dst_ptr, int dst_width) {
419  asm volatile (
420    "vld1.u16     {q13}, [%4]                  \n"
421    "vld1.u8      {q14}, [%5]                  \n"
422    "add          %3, %0                       \n"
423    "1:                                        \n"
424
425    // d0 = 00 40 01 41 02 42 03 43
426    // d1 = 10 50 11 51 12 52 13 53
427    // d2 = 20 60 21 61 22 62 23 63
428    // d3 = 30 70 31 71 32 72 33 73
429    "vld4.u8      {d0, d1, d2, d3}, [%0]!      \n"
430    "vld4.u8      {d4, d5, d6, d7}, [%3]!      \n"
431
432    // Shuffle the input data around to align the data
433    //  so adjacent data can be added.  0,1 - 2,3 - 4,5 - 6,7
434    // d0 = 00 10 01 11 02 12 03 13
435    // d1 = 40 50 41 51 42 52 43 53
436    "vtrn.u8      d0, d1                       \n"
437    "vtrn.u8      d4, d5                       \n"
438
439    // d2 = 20 30 21 31 22 32 23 33
440    // d3 = 60 70 61 71 62 72 63 73
441    "vtrn.u8      d2, d3                       \n"
442    "vtrn.u8      d6, d7                       \n"
443
444    // d0 = 00+10 01+11 02+12 03+13
445    // d2 = 40+50 41+51 42+52 43+53
446    "vpaddl.u8    q0, q0                       \n"
447    "vpaddl.u8    q2, q2                       \n"
448
449    // d3 = 60+70 61+71 62+72 63+73
450    "vpaddl.u8    d3, d3                       \n"
451    "vpaddl.u8    d7, d7                       \n"
452
453    // combine source lines
454    "vadd.u16     q0, q2                       \n"
455    "vadd.u16     d4, d3, d7                   \n"
456
457    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
458    "vqrshrn.u16  d4, q2, #2                   \n"
459
460    // Shuffle 2,3 reg around so that 2 can be added to the
461    //  0,1 reg and 3 can be added to the 4,5 reg.  This
462    //  requires expanding from u8 to u16 as the 0,1 and 4,5
463    //  registers are already expanded.  Then do transposes
464    //  to get aligned.
465    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
466    "vmovl.u8     q1, d2                       \n"
467    "vmovl.u8     q3, d6                       \n"
468
469    // combine source lines
470    "vadd.u16     q1, q3                       \n"
471
472    // d4 = xx 20 xx 30 xx 22 xx 32
473    // d5 = xx 21 xx 31 xx 23 xx 33
474    "vtrn.u32     d2, d3                       \n"
475
476    // d4 = xx 20 xx 21 xx 22 xx 23
477    // d5 = xx 30 xx 31 xx 32 xx 33
478    "vtrn.u16     d2, d3                       \n"
479
480    // 0+1+2, 3+4+5
481    "vadd.u16     q0, q1                       \n"
482
483    // Need to divide, but can't downshift as the value
484    //  isn't a power of 2.  So multiply by 65536 / n
485    //  and take the upper 16 bits.
486    "vqrdmulh.s16 q0, q13                      \n"
487
488    // Align for table lookup, vtbl requires registers to
489    //  be adjacent
490    "vmov.u8      d2, d4                       \n"
491
492    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
493    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"
494
495    "vst1.u8      {d3}, [%1]!                  \n"
496    "vst1.u32     {d4[0]}, [%1]!               \n"
497    "subs         %2, #12                      \n"
498    "bhi          1b                           \n"
499    : "+r"(src_ptr),          // %0
500      "+r"(dst_ptr),          // %1
501      "+r"(dst_width),        // %2
502      "+r"(src_stride)        // %3
503    : "r"(mult38_div6),       // %4
504      "r"(shuf38_2)           // %5
505    : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
506  );
507}
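// Scalar sketch of the 8 -> 3 box reduction performed by the two routines above
// (3-row case shown; illustration only, not built).  The 2-row variant divides
// by 6 and 4 instead of 9 and 6, and the asm uses the fixed-point multiplies
// above rather than integer division.
#if 0
static void ScaleRowDown38_3_C_Sketch(const uint8* s, int stride,
                                      uint8* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3, s += 8, dst += 3) {
    int sum0 = 0, sum1 = 0, sum2 = 0, r;
    for (r = 0; r < 3; ++r) {
      const uint8* p = s + r * stride;
      sum0 += p[0] + p[1] + p[2];
      sum1 += p[3] + p[4] + p[5];
      sum2 += p[6] + p[7];
    }
    dst[0] = (uint8)(sum0 / 9);
    dst[1] = (uint8)(sum1 / 9);
    dst[2] = (uint8)(sum2 / 6);
  }
}
#endif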
508
509/**
510 * SSE2 downscalers with interpolation.
511 *
512 * Provided by Frank Barchard (fbarchard@google.com)
513 *
514 */
515
516// Constants for SSE2 code
517#elif (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) && \
518    !defined(YUV_DISABLE_ASM)
519#if defined(_MSC_VER)
520#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
521#elif (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
522#define TALIGN16(t, var) t var __attribute__((aligned(16)))
523#else
524#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
525#endif
526
527#if (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__)) && \
528    defined(__i386__)
529#define DECLARE_FUNCTION(name)                                                 \
530    ".text                                     \n"                             \
531    ".globl _" #name "                         \n"                             \
532"_" #name ":                                   \n"
533#else
534#define DECLARE_FUNCTION(name)                                                 \
535    ".text                                     \n"                             \
536    ".global " #name "                         \n"                             \
537#name ":                                       \n"
538#endif
539
540
541// Offsets for source bytes 0 to 9
542//extern "C"
543TALIGN16(const uint8, shuf0[16]) =
544  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
545
546// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
547//extern "C"
548TALIGN16(const uint8, shuf1[16]) =
549  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
550
551// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
552//extern "C"
553TALIGN16(const uint8, shuf2[16]) =
554  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
555
556// Offsets for source bytes 0 to 10
557//extern "C"
558TALIGN16(const uint8, shuf01[16]) =
559  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
560
561// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
562//extern "C"
563TALIGN16(const uint8, shuf11[16]) =
564  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
565
566// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
567//extern "C"
568TALIGN16(const uint8, shuf21[16]) =
569  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
570
571// Coefficients for source bytes 0 to 10
572//extern "C"
573TALIGN16(const uint8, madd01[16]) =
574  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
575
576// Coefficients for source bytes 10 to 21
577//extern "C"
578TALIGN16(const uint8, madd11[16]) =
579  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
580
581// Coefficients for source bytes 21 to 31
582//extern "C"
583TALIGN16(const uint8, madd21[16]) =
584  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
585
586// Rounding constant for the 3/4 filters, added before the shift right by 2.
587//extern "C"
588TALIGN16(const int16, round34[8]) =
589  { 2, 2, 2, 2, 2, 2, 2, 2 };
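// How the shuf/madd/round constants combine: pshufb pairs adjacent source
// bytes, pmaddubsw multiplies each pair by its weights and sums, round34 is
// added and the result shifted right by 2.  For weights {3, 1} that is
// (illustration only):
//   (3 * s[0] + 1 * s[1] + 2) >> 2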
590
591//extern "C"
592TALIGN16(const uint8, shuf38a[16]) =
593  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
594
595//extern "C"
596TALIGN16(const uint8, shuf38b[16]) =
597  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
598
599// Arrange words 0,3,6 into 0,1,2
600//extern "C"
601TALIGN16(const uint8, shufac0[16]) =
602  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
603
604// Arrange words 0,3,6 into 3,4,5
605//extern "C"
606TALIGN16(const uint8, shufac3[16]) =
607  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
608
609// Scaling values for boxes of 3x3 and 2x3
610//extern "C"
611TALIGN16(const uint16, scaleac3[8]) =
612  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
613
614// Arrange first value for pixels 0,1,2,3,4,5
615//extern "C"
616TALIGN16(const uint8, shufab0[16]) =
617  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
618
619// Arrange second value for pixels 0,1,2,3,4,5
620//extern "C"
621TALIGN16(const uint8, shufab1[16]) =
622  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
623
624// Arrange third value for pixels 0,1,2,3,4,5
625//extern "C"
626TALIGN16(const uint8, shufab2[16]) =
627  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
628
629// Scaling values for boxes of 3x2 and 2x2
630//extern "C"
631TALIGN16(const uint16, scaleab2[8]) =
632  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
633#endif
634
635#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
636
637#define HAS_SCALEROWDOWN2_SSE2
638// Reads 32 pixels, throws half away and writes 16 pixels.
639// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
640__declspec(naked)
641static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
642                               uint8* dst_ptr, int dst_width) {
643  __asm {
644    mov        eax, [esp + 4]        // src_ptr
645                                     // src_stride ignored
646    mov        edx, [esp + 12]       // dst_ptr
647    mov        ecx, [esp + 16]       // dst_width
648    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
649    psrlw      xmm5, 8
650
651  wloop:
652    movdqa     xmm0, [eax]
653    movdqa     xmm1, [eax + 16]
654    lea        eax,  [eax + 32]
655    pand       xmm0, xmm5
656    pand       xmm1, xmm5
657    packuswb   xmm0, xmm1
658    movdqa     [edx], xmm0
659    lea        edx, [edx + 16]
660    sub        ecx, 16
661    ja         wloop
662
663    ret
664  }
665}
666// Blends 32x2 rectangle to 16x1.
667// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
668__declspec(naked)
669void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
670                           uint8* dst_ptr, int dst_width) {
671  __asm {
672    push       esi
673    mov        eax, [esp + 4 + 4]    // src_ptr
674    mov        esi, [esp + 4 + 8]    // src_stride
675    mov        edx, [esp + 4 + 12]   // dst_ptr
676    mov        ecx, [esp + 4 + 16]   // dst_width
677    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
678    psrlw      xmm5, 8
679
680  wloop:
681    movdqa     xmm0, [eax]
682    movdqa     xmm1, [eax + 16]
683    movdqa     xmm2, [eax + esi]
684    movdqa     xmm3, [eax + esi + 16]
685    lea        eax,  [eax + 32]
686    pavgb      xmm0, xmm2            // average rows
687    pavgb      xmm1, xmm3
688
689    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
690    psrlw      xmm0, 8
691    movdqa     xmm3, xmm1
692    psrlw      xmm1, 8
693    pand       xmm2, xmm5
694    pand       xmm3, xmm5
695    pavgw      xmm0, xmm2
696    pavgw      xmm1, xmm3
697    packuswb   xmm0, xmm1
698
699    movdqa     [edx], xmm0
700    lea        edx, [edx + 16]
701    sub        ecx, 16
702    ja         wloop
703
704    pop        esi
705    ret
706  }
707}
708
709#define HAS_SCALEROWDOWN4_SSE2
710// Point samples 32 pixels to 8 pixels.
711// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
712__declspec(naked)
713static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
714                               uint8* dst_ptr, int dst_width) {
715  __asm {
716    pushad
717    mov        esi, [esp + 32 + 4]   // src_ptr
718                                     // src_stride ignored
719    mov        edi, [esp + 32 + 12]  // dst_ptr
720    mov        ecx, [esp + 32 + 16]  // dst_width
721    pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
722    psrld      xmm5, 24
723
724  wloop:
725    movdqa     xmm0, [esi]
726    movdqa     xmm1, [esi + 16]
727    lea        esi,  [esi + 32]
728    pand       xmm0, xmm5
729    pand       xmm1, xmm5
730    packuswb   xmm0, xmm1
731    packuswb   xmm0, xmm0
732    movq       qword ptr [edi], xmm0
733    lea        edi, [edi + 8]
734    sub        ecx, 8
735    ja         wloop
736
737    popad
738    ret
739  }
740}
741
742// Blends 32x4 rectangle to 8x1.
743// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
744__declspec(naked)
745static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
746                                  uint8* dst_ptr, int dst_width) {
747  __asm {
748    pushad
749    mov        esi, [esp + 32 + 4]   // src_ptr
750    mov        ebx, [esp + 32 + 8]   // src_stride
751    mov        edi, [esp + 32 + 12]  // dst_ptr
752    mov        ecx, [esp + 32 + 16]  // dst_width
753    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
754    psrlw      xmm7, 8
755    lea        edx, [ebx + ebx * 2]  // src_stride * 3
756
757  wloop:
758    movdqa     xmm0, [esi]
759    movdqa     xmm1, [esi + 16]
760    movdqa     xmm2, [esi + ebx]
761    movdqa     xmm3, [esi + ebx + 16]
762    pavgb      xmm0, xmm2            // average rows
763    pavgb      xmm1, xmm3
764    movdqa     xmm2, [esi + ebx * 2]
765    movdqa     xmm3, [esi + ebx * 2 + 16]
766    movdqa     xmm4, [esi + edx]
767    movdqa     xmm5, [esi + edx + 16]
768    lea        esi, [esi + 32]
769    pavgb      xmm2, xmm4
770    pavgb      xmm3, xmm5
771    pavgb      xmm0, xmm2
772    pavgb      xmm1, xmm3
773
774    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
775    psrlw      xmm0, 8
776    movdqa     xmm3, xmm1
777    psrlw      xmm1, 8
778    pand       xmm2, xmm7
779    pand       xmm3, xmm7
780    pavgw      xmm0, xmm2
781    pavgw      xmm1, xmm3
782    packuswb   xmm0, xmm1
783
784    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
785    psrlw      xmm0, 8
786    pand       xmm2, xmm7
787    pavgw      xmm0, xmm2
788    packuswb   xmm0, xmm0
789
790    movq       qword ptr [edi], xmm0
791    lea        edi, [edi + 8]
792    sub        ecx, 8
793    ja         wloop
794
795    popad
796    ret
797  }
798}
799
800#define HAS_SCALEROWDOWN8_SSE2
801// Point samples 32 pixels to 4 pixels.
802// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
803__declspec(naked)
804static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
805                               uint8* dst_ptr, int dst_width) {
806  __asm {
807    pushad
808    mov        esi, [esp + 32 + 4]   // src_ptr
809                                     // src_stride ignored
810    mov        edi, [esp + 32 + 12]  // dst_ptr
811    mov        ecx, [esp + 32 + 16]  // dst_width
812    pcmpeqb    xmm5, xmm5            // generate mask isolating 1 src 8 bytes
813    psrlq      xmm5, 56
814
815  wloop:
816    movdqa     xmm0, [esi]
817    movdqa     xmm1, [esi + 16]
818    lea        esi,  [esi + 32]
819    pand       xmm0, xmm5
820    pand       xmm1, xmm5
821    packuswb   xmm0, xmm1  // 32->16
822    packuswb   xmm0, xmm0  // 16->8
823    packuswb   xmm0, xmm0  // 8->4
824    movd       dword ptr [edi], xmm0
825    lea        edi, [edi + 4]
826    sub        ecx, 4
827    ja         wloop
828
829    popad
830    ret
831  }
832}
833
834// Blends 32x8 rectangle to 4x1.
835// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
836__declspec(naked)
837static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
838                                  uint8* dst_ptr, int dst_width) {
839  __asm {
840    pushad
841    mov        esi, [esp + 32 + 4]   // src_ptr
842    mov        ebx, [esp + 32 + 8]   // src_stride
843    mov        edi, [esp + 32 + 12]  // dst_ptr
844    mov        ecx, [esp + 32 + 16]  // dst_width
845    lea        edx, [ebx + ebx * 2]  // src_stride * 3
846    pxor       xmm7, xmm7
847
848  wloop:
849    movdqa     xmm0, [esi]           // average 8 rows to 1
850    movdqa     xmm1, [esi + 16]
851    movdqa     xmm2, [esi + ebx]
852    movdqa     xmm3, [esi + ebx + 16]
853    pavgb      xmm0, xmm2
854    pavgb      xmm1, xmm3
855    movdqa     xmm2, [esi + ebx * 2]
856    movdqa     xmm3, [esi + ebx * 2 + 16]
857    movdqa     xmm4, [esi + edx]
858    movdqa     xmm5, [esi + edx + 16]
859    lea        ebp, [esi + ebx * 4]
860    lea        esi, [esi + 32]
861    pavgb      xmm2, xmm4
862    pavgb      xmm3, xmm5
863    pavgb      xmm0, xmm2
864    pavgb      xmm1, xmm3
865
866    movdqa     xmm2, [ebp]
867    movdqa     xmm3, [ebp + 16]
868    movdqa     xmm4, [ebp + ebx]
869    movdqa     xmm5, [ebp + ebx + 16]
870    pavgb      xmm2, xmm4
871    pavgb      xmm3, xmm5
872    movdqa     xmm4, [ebp + ebx * 2]
873    movdqa     xmm5, [ebp + ebx * 2 + 16]
874    movdqa     xmm6, [ebp + edx]
875    pavgb      xmm4, xmm6
876    movdqa     xmm6, [ebp + edx + 16]
877    pavgb      xmm5, xmm6
878    pavgb      xmm2, xmm4
879    pavgb      xmm3, xmm5
880    pavgb      xmm0, xmm2
881    pavgb      xmm1, xmm3
882
883    psadbw     xmm0, xmm7            // average 32 pixels to 4
884    psadbw     xmm1, xmm7
885    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
886    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
887    por        xmm0, xmm1            //      -> 3201
888    psrlw      xmm0, 3
889    packuswb   xmm0, xmm0
890    packuswb   xmm0, xmm0
891    movd       dword ptr [edi], xmm0
892
893    lea        edi, [edi + 4]
894    sub        ecx, 4
895    ja         wloop
896
897    popad
898    ret
899  }
900}
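// Note on the psadbw step above: psadbw against a zeroed register sums each
// group of 8 bytes into a 16-bit lane, so after the pavgb tree reduces 8 rows
// to 1, a single shift right by 3 yields the horizontal average of 8 pixels.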
901
902#define HAS_SCALEROWDOWN34_SSSE3
903// Point samples 32 pixels to 24 pixels.
904// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
905// Then shuffled to do the scaling.
906
907// Note that movdqa+palign may be better than movdqu.
908// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
909__declspec(naked)
910static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
911                                 uint8* dst_ptr, int dst_width) {
912  __asm {
913    pushad
914    mov        esi, [esp + 32 + 4]   // src_ptr
915                                     // src_stride ignored
916    mov        edi, [esp + 32 + 12]  // dst_ptr
917    mov        ecx, [esp + 32 + 16]  // dst_width
918    movdqa     xmm3, _shuf0
919    movdqa     xmm4, _shuf1
920    movdqa     xmm5, _shuf2
921
922  wloop:
923    movdqa     xmm0, [esi]
924    movdqa     xmm1, [esi + 16]
925    lea        esi,  [esi + 32]
926    movdqa     xmm2, xmm1
927    palignr    xmm1, xmm0, 8
928    pshufb     xmm0, xmm3
929    pshufb     xmm1, xmm4
930    pshufb     xmm2, xmm5
931    movq       qword ptr [edi], xmm0
932    movq       qword ptr [edi + 8], xmm1
933    movq       qword ptr [edi + 16], xmm2
934    lea        edi, [edi + 24]
935    sub        ecx, 24
936    ja         wloop
937
938    popad
939    ret
940  }
941}
942
943// Blends 32x2 rectangle to 24x1
944// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
945// Then shuffled to do the scaling.
946
947// Register usage:
948// xmm0 src_row 0
949// xmm1 src_row 1
950// xmm2 shuf 0
951// xmm3 shuf 1
952// xmm4 shuf 2
953// xmm5 madd 0
954// xmm6 madd 1
955// xmm7 round34
956
957// Note that movdqa+palign may be better than movdqu.
958// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
959__declspec(naked)
960static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
961                                       uint8* dst_ptr, int dst_width) {
962  __asm {
963    pushad
964    mov        esi, [esp + 32 + 4]   // src_ptr
965    mov        ebx, [esp + 32 + 8]   // src_stride
966    mov        edi, [esp + 32 + 12]  // dst_ptr
967    mov        ecx, [esp + 32 + 16]  // dst_width
968    movdqa     xmm2, _shuf01
969    movdqa     xmm3, _shuf11
970    movdqa     xmm4, _shuf21
971    movdqa     xmm5, _madd01
972    movdqa     xmm6, _madd11
973    movdqa     xmm7, _round34
974
975  wloop:
976    movdqa     xmm0, [esi]           // pixels 0..7
977    movdqa     xmm1, [esi+ebx]
978    pavgb      xmm0, xmm1
979    pshufb     xmm0, xmm2
980    pmaddubsw  xmm0, xmm5
981    paddsw     xmm0, xmm7
982    psrlw      xmm0, 2
983    packuswb   xmm0, xmm0
984    movq       qword ptr [edi], xmm0
985    movdqu     xmm0, [esi+8]         // pixels 8..15
986    movdqu     xmm1, [esi+ebx+8]
987    pavgb      xmm0, xmm1
988    pshufb     xmm0, xmm3
989    pmaddubsw  xmm0, xmm6
990    paddsw     xmm0, xmm7
991    psrlw      xmm0, 2
992    packuswb   xmm0, xmm0
993    movq       qword ptr [edi+8], xmm0
994    movdqa     xmm0, [esi+16]        // pixels 16..23
995    movdqa     xmm1, [esi+ebx+16]
996    lea        esi, [esi+32]
997    pavgb      xmm0, xmm1
998    pshufb     xmm0, xmm4
999    movdqa     xmm1, _madd21
1000    pmaddubsw  xmm0, xmm1
1001    paddsw     xmm0, xmm7
1002    psrlw      xmm0, 2
1003    packuswb   xmm0, xmm0
1004    movq       qword ptr [edi+16], xmm0
1005    lea        edi, [edi+24]
1006    sub        ecx, 24
1007    ja         wloop
1008
1009    popad
1010    ret
1011  }
1012}
1013
1014// Note that movdqa+palign may be better than movdqu.
1015// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
1016__declspec(naked)
1017static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
1018                                       uint8* dst_ptr, int dst_width) {
1019  __asm {
1020    pushad
1021    mov        esi, [esp + 32 + 4]   // src_ptr
1022    mov        ebx, [esp + 32 + 8]   // src_stride
1023    mov        edi, [esp + 32 + 12]  // dst_ptr
1024    mov        ecx, [esp + 32 + 16]  // dst_width
1025    movdqa     xmm2, _shuf01
1026    movdqa     xmm3, _shuf11
1027    movdqa     xmm4, _shuf21
1028    movdqa     xmm5, _madd01
1029    movdqa     xmm6, _madd11
1030    movdqa     xmm7, _round34
1031
1032  wloop:
1033    movdqa     xmm0, [esi]           // pixels 0..7
1034    movdqa     xmm1, [esi+ebx]
1035    pavgb      xmm1, xmm0
1036    pavgb      xmm0, xmm1
1037    pshufb     xmm0, xmm2
1038    pmaddubsw  xmm0, xmm5
1039    paddsw     xmm0, xmm7
1040    psrlw      xmm0, 2
1041    packuswb   xmm0, xmm0
1042    movq       qword ptr [edi], xmm0
1043    movdqu     xmm0, [esi+8]         // pixels 8..15
1044    movdqu     xmm1, [esi+ebx+8]
1045    pavgb      xmm1, xmm0
1046    pavgb      xmm0, xmm1
1047    pshufb     xmm0, xmm3
1048    pmaddubsw  xmm0, xmm6
1049    paddsw     xmm0, xmm7
1050    psrlw      xmm0, 2
1051    packuswb   xmm0, xmm0
1052    movq       qword ptr [edi+8], xmm0
1053    movdqa     xmm0, [esi+16]        // pixels 16..23
1054    movdqa     xmm1, [esi+ebx+16]
1055    lea        esi, [esi+32]
1056    pavgb      xmm1, xmm0
1057    pavgb      xmm0, xmm1
1058    pshufb     xmm0, xmm4
1059    movdqa     xmm1, _madd21
1060    pmaddubsw  xmm0, xmm1
1061    paddsw     xmm0, xmm7
1062    psrlw      xmm0, 2
1063    packuswb   xmm0, xmm0
1064    movq       qword ptr [edi+16], xmm0
1065    lea        edi, [edi+24]
1066    sub        ecx, 24
1067    ja         wloop
1068
1069    popad
1070    ret
1071  }
1072}
1073
1074#define HAS_SCALEROWDOWN38_SSSE3
1075// 3/8 point sampler
1076
1077// Scale 32 pixels to 12
1078__declspec(naked)
1079static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
1080                                 uint8* dst_ptr, int dst_width) {
1081  __asm {
1082    pushad
1083    mov        esi, [esp + 32 + 4]   // src_ptr
1084    mov        edx, [esp + 32 + 8]   // src_stride
1085    mov        edi, [esp + 32 + 12]  // dst_ptr
1086    mov        ecx, [esp + 32 + 16]  // dst_width
1087    movdqa     xmm4, _shuf38a
1088    movdqa     xmm5, _shuf38b
1089
1090  xloop:
1091    movdqa     xmm0, [esi]           // 16 pixels -> 0,1,2,3,4,5
1092    movdqa     xmm1, [esi + 16]      // 16 pixels -> 6,7,8,9,10,11
1093    lea        esi, [esi + 32]
1094    pshufb     xmm0, xmm4
1095    pshufb     xmm1, xmm5
1096    paddusb    xmm0, xmm1
1097
1098    movq       qword ptr [edi], xmm0 // write 12 pixels
1099    movhlps    xmm1, xmm0
1100    movd       [edi + 8], xmm1
1101    lea        edi, [edi + 12]
1102    sub        ecx, 12
1103    ja         xloop
1104
1105    popad
1106    ret
1107  }
1108}
1109
1110// Scale 16x3 pixels to 6x1 with interpolation
1111__declspec(naked)
1112static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
1113                                       uint8* dst_ptr, int dst_width) {
1114  __asm {
1115    pushad
1116    mov        esi, [esp + 32 + 4]   // src_ptr
1117    mov        edx, [esp + 32 + 8]   // src_stride
1118    mov        edi, [esp + 32 + 12]  // dst_ptr
1119    mov        ecx, [esp + 32 + 16]  // dst_width
1120    movdqa     xmm4, _shufac0
1121    movdqa     xmm5, _shufac3
1122    movdqa     xmm6, _scaleac3
1123    pxor       xmm7, xmm7
1124
1125  xloop:
1126    movdqa     xmm0, [esi]           // sum up 3 rows into xmm0/1
1127    movdqa     xmm2, [esi + edx]
1128    movhlps    xmm1, xmm0
1129    movhlps    xmm3, xmm2
1130    punpcklbw  xmm0, xmm7
1131    punpcklbw  xmm1, xmm7
1132    punpcklbw  xmm2, xmm7
1133    punpcklbw  xmm3, xmm7
1134    paddusw    xmm0, xmm2
1135    paddusw    xmm1, xmm3
1136    movdqa     xmm2, [esi + edx * 2]
1137    lea        esi, [esi + 16]
1138    movhlps    xmm3, xmm2
1139    punpcklbw  xmm2, xmm7
1140    punpcklbw  xmm3, xmm7
1141    paddusw    xmm0, xmm2
1142    paddusw    xmm1, xmm3
1143
1144    movdqa     xmm2, xmm0            // 8 pixels -> 0,1,2 of xmm2
1145    psrldq     xmm0, 2
1146    paddusw    xmm2, xmm0
1147    psrldq     xmm0, 2
1148    paddusw    xmm2, xmm0
1149    pshufb     xmm2, xmm4
1150
1151    movdqa     xmm3, xmm1            // 8 pixels -> 3,4,5 of xmm2
1152    psrldq     xmm1, 2
1153    paddusw    xmm3, xmm1
1154    psrldq     xmm1, 2
1155    paddusw    xmm3, xmm1
1156    pshufb     xmm3, xmm5
1157    paddusw    xmm2, xmm3
1158
1159    pmulhuw    xmm2, xmm6            // divide by 9,9,6, 9,9,6
1160    packuswb   xmm2, xmm2
1161
1162    movd       [edi], xmm2           // write 6 pixels
1163    pextrw     eax, xmm2, 2
1164    mov        [edi + 4], ax
1165    lea        edi, [edi + 6]
1166    sub        ecx, 6
1167    ja         xloop
1168
1169    popad
1170    ret
1171  }
1172}
1173
1174// Scale 16x2 pixels to 6x1 with interpolation
1175__declspec(naked)
1176static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
1177                                       uint8* dst_ptr, int dst_width) {
1178  __asm {
1179    pushad
1180    mov        esi, [esp + 32 + 4]   // src_ptr
1181    mov        edx, [esp + 32 + 8]   // src_stride
1182    mov        edi, [esp + 32 + 12]  // dst_ptr
1183    mov        ecx, [esp + 32 + 16]  // dst_width
1184    movdqa     xmm4, _shufab0
1185    movdqa     xmm5, _shufab1
1186    movdqa     xmm6, _shufab2
1187    movdqa     xmm7, _scaleab2
1188
1189  xloop:
1190    movdqa     xmm2, [esi]           // average 2 rows into xmm2
1191    pavgb      xmm2, [esi + edx]
1192    lea        esi, [esi + 16]
1193
1194    movdqa     xmm0, xmm2            // 16 pixels -> 0,1,2,3,4,5 of xmm0
1195    pshufb     xmm0, xmm4
1196    movdqa     xmm1, xmm2
1197    pshufb     xmm1, xmm5
1198    paddusw    xmm0, xmm1
1199    pshufb     xmm2, xmm6
1200    paddusw    xmm0, xmm2
1201
1202    pmulhuw    xmm0, xmm7            // divide by 3,3,2, 3,3,2
1203    packuswb   xmm0, xmm0
1204
1205    movd       [edi], xmm0           // write 6 pixels
1206    pextrw     eax, xmm0, 2
1207    mov        [edi + 4], ax
1208    lea        edi, [edi + 6]
1209    sub        ecx, 6
1210    ja         xloop
1211
1212    popad
1213    ret
1214  }
1215}
1216
1217#define HAS_SCALEADDROWS_SSE2
1218
1219// Reads 16xN bytes and produces 16 shorts at a time.
1220__declspec(naked)
1221static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
1222                              uint16* dst_ptr, int src_width,
1223                              int src_height) {
1224  __asm {
1225    pushad
1226    mov        esi, [esp + 32 + 4]   // src_ptr
1227    mov        edx, [esp + 32 + 8]   // src_stride
1228    mov        edi, [esp + 32 + 12]  // dst_ptr
1229    mov        ecx, [esp + 32 + 16]  // src_width
1230    mov        ebx, [esp + 32 + 20]  // height
1231    pxor       xmm5, xmm5
1232    dec        ebx
1233
1234  xloop:
1235    // first row
1236    movdqa     xmm2, [esi]
1237    lea        eax, [esi + edx]
1238    movhlps    xmm3, xmm2
1239    mov        ebp, ebx
1240    punpcklbw  xmm2, xmm5
1241    punpcklbw  xmm3, xmm5
1242
1243    // sum remaining rows
1244  yloop:
1245    movdqa     xmm0, [eax]       // read 16 pixels
1246    lea        eax, [eax + edx]  // advance to next row
1247    movhlps    xmm1, xmm0
1248    punpcklbw  xmm0, xmm5
1249    punpcklbw  xmm1, xmm5
1250    paddusw    xmm2, xmm0        // sum 16 words
1251    paddusw    xmm3, xmm1
1252    sub        ebp, 1
1253    ja         yloop
1254
1255    movdqa     [edi], xmm2
1256    movdqa     [edi + 16], xmm3
1257    lea        edi, [edi + 32]
1258    lea        esi, [esi + 16]
1259
1260    sub        ecx, 16
1261    ja         xloop
1262
1263    popad
1264    ret
1265  }
1266}
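// Scalar sketch of the row accumulation above (illustration only, not built;
// the asm uses saturating adds and handles 16 columns per pass):
#if 0
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, int src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum = (uint16)(sum + src_ptr[x + y * src_stride]);
    }
    dst_ptr[x] = sum;
  }
}
#endif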
1267
1268// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
1269#define HAS_SCALEFILTERROWS_SSE2
1270__declspec(naked)
1271static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
1272                                 int src_stride, int dst_width,
1273                                 int source_y_fraction) {
1274  __asm {
1275    push       esi
1276    push       edi
1277    mov        edi, [esp + 8 + 4]   // dst_ptr
1278    mov        esi, [esp + 8 + 8]   // src_ptr
1279    mov        edx, [esp + 8 + 12]  // src_stride
1280    mov        ecx, [esp + 8 + 16]  // dst_width
1281    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
1282    cmp        eax, 0
1283    je         xloop1
1284    cmp        eax, 128
1285    je         xloop2
1286
1287    movd       xmm6, eax            // xmm6 = y fraction
1288    punpcklwd  xmm6, xmm6
1289    pshufd     xmm6, xmm6, 0
1290    neg        eax                  // xmm5 = 256 - y fraction
1291    add        eax, 256
1292    movd       xmm5, eax
1293    punpcklwd  xmm5, xmm5
1294    pshufd     xmm5, xmm5, 0
1295    pxor       xmm7, xmm7
1296
1297  xloop:
1298    movdqa     xmm0, [esi]
1299    movdqa     xmm2, [esi + edx]
1300    lea        esi, [esi + 16]
1301    movdqa     xmm1, xmm0
1302    movdqa     xmm3, xmm2
1303    punpcklbw  xmm0, xmm7
1304    punpcklbw  xmm2, xmm7
1305    punpckhbw  xmm1, xmm7
1306    punpckhbw  xmm3, xmm7
1307    pmullw     xmm0, xmm5           // scale row 0
1308    pmullw     xmm1, xmm5
1309    pmullw     xmm2, xmm6           // scale row 1
1310    pmullw     xmm3, xmm6
1311    paddusw    xmm0, xmm2           // sum rows
1312    paddusw    xmm1, xmm3
1313    psrlw      xmm0, 8
1314    psrlw      xmm1, 8
1315    packuswb   xmm0, xmm1
1316    movdqa     [edi], xmm0
1317    lea        edi, [edi + 16]
1318    sub        ecx, 16
1319    ja         xloop
1320
1321    mov        al, [edi - 1]
1322    mov        [edi], al
1323    pop        edi
1324    pop        esi
1325    ret
1326
1327  xloop1:
1328    movdqa     xmm0, [esi]
1329    lea        esi, [esi + 16]
1330    movdqa     [edi], xmm0
1331    lea        edi, [edi + 16]
1332    sub        ecx, 16
1333    ja         xloop1
1334
1335    mov        al, [edi - 1]
1336    mov        [edi], al
1337    pop        edi
1338    pop        esi
1339    ret
1340
1341  xloop2:
1342    movdqa     xmm0, [esi]
1343    movdqa     xmm2, [esi + edx]
1344    lea        esi, [esi + 16]
1345    pavgb      xmm0, xmm2
1346    movdqa     [edi], xmm0
1347    lea        edi, [edi + 16]
1348    sub        ecx, 16
1349    ja         xloop2
1350
1351    mov        al, [edi - 1]
1352    mov        [edi], al
1353    pop        edi
1354    pop        esi
1355    ret
1356  }
1357}
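// Scalar sketch of the bilinear row blend above (illustration only, not built).
// The asm special-cases source_y_fraction 0 (plain copy) and 128 (pavgb), and,
// like the loops above, repeats the last output pixel one byte past dst_width.
#if 0
static void ScaleFilterRows_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int src_stride, int dst_width,
                                     int source_y_fraction) {
  int y1 = source_y_fraction;      // weight of the lower row (0..255)
  int y0 = 256 - y1;               // weight of the upper row
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 +
                          src_ptr[x + src_stride] * y1) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];
}
#endif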
1358
1359// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
1360#define HAS_SCALEFILTERROWS_SSSE3
1361__declspec(naked)
1362static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
1363                                  int src_stride, int dst_width,
1364                                  int source_y_fraction) {
1365  __asm {
1366    push       esi
1367    push       edi
1368    mov        edi, [esp + 8 + 4]   // dst_ptr
1369    mov        esi, [esp + 8 + 8]   // src_ptr
1370    mov        edx, [esp + 8 + 12]  // src_stride
1371    mov        ecx, [esp + 8 + 16]  // dst_width
1372    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
1373    cmp        eax, 0
1374    je         xloop1
1375    cmp        eax, 128
1376    je         xloop2
1377
1378    shr        eax, 1               // fraction/2 (0..127)
1379    mov        ah, al               // ah = fraction/2 (row 1 weight)
1380    neg        al
1381    add        al, 128              // al = 128 - fraction/2 (row 0 weight)
1382    movd       xmm5, eax
1383    punpcklwd  xmm5, xmm5
1384    pshufd     xmm5, xmm5, 0        // xmm5 = (row 0, row 1) weight pairs for pmaddubsw
1385
1386  xloop:
1387    movdqa     xmm0, [esi]
1388    movdqa     xmm2, [esi + edx]
1389    lea        esi, [esi + 16]
1390    movdqa     xmm1, xmm0
1391    punpcklbw  xmm0, xmm2
1392    punpckhbw  xmm1, xmm2
1393    pmaddubsw  xmm0, xmm5
1394    pmaddubsw  xmm1, xmm5
1395    psrlw      xmm0, 7
1396    psrlw      xmm1, 7
1397    packuswb   xmm0, xmm1
1398    movdqa     [edi], xmm0
1399    lea        edi, [edi + 16]
1400    sub        ecx, 16
1401    ja         xloop
1402
1403    mov        al, [edi - 1]
1404    mov        [edi], al
1405    pop        edi
1406    pop        esi
1407    ret
1408
1409  xloop1:
1410    movdqa     xmm0, [esi]
1411    lea        esi, [esi + 16]
1412    movdqa     [edi], xmm0
1413    lea        edi, [edi + 16]
1414    sub        ecx, 16
1415    ja         xloop1
1416
1417    mov        al, [edi - 1]
1418    mov        [edi], al
1419    pop        edi
1420    pop        esi
1421    ret
1422
1423  xloop2:
1424    movdqa     xmm0, [esi]
1425    movdqa     xmm2, [esi + edx]
1426    lea        esi, [esi + 16]
1427    pavgb      xmm0, xmm2
1428    movdqa     [edi], xmm0
1429    lea        edi, [edi + 16]
1430    sub        ecx, 16
1431    ja         xloop2
1432
1433    mov        al, [edi - 1]
1434    mov        [edi], al
1435    pop        edi
1436    pop        esi
1437    ret
1438
1439  }
1440}
1441
1442// Note that movdqa+palign may be better than movdqu.
1443// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
1444__declspec(naked)
1445static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
1446                                    int dst_width) {
1447  __asm {
1448    mov        edx, [esp + 4]    // dst_ptr
1449    mov        eax, [esp + 8]    // src_ptr
1450    mov        ecx, [esp + 12]   // dst_width
1451    movdqa     xmm1, _round34
1452    movdqa     xmm2, _shuf01
1453    movdqa     xmm3, _shuf11
1454    movdqa     xmm4, _shuf21
1455    movdqa     xmm5, _madd01
1456    movdqa     xmm6, _madd11
1457    movdqa     xmm7, _madd21
1458
1459  wloop:
1460    movdqa     xmm0, [eax]           // pixels 0..7
1461    pshufb     xmm0, xmm2
1462    pmaddubsw  xmm0, xmm5
1463    paddsw     xmm0, xmm1
1464    psrlw      xmm0, 2
1465    packuswb   xmm0, xmm0
1466    movq       qword ptr [edx], xmm0
1467    movdqu     xmm0, [eax+8]         // pixels 8..15
1468    pshufb     xmm0, xmm3
1469    pmaddubsw  xmm0, xmm6
1470    paddsw     xmm0, xmm1
1471    psrlw      xmm0, 2
1472    packuswb   xmm0, xmm0
1473    movq       qword ptr [edx+8], xmm0
1474    movdqa     xmm0, [eax+16]        // pixels 16..23
1475    lea        eax, [eax+32]
1476    pshufb     xmm0, xmm4
1477    pmaddubsw  xmm0, xmm7
1478    paddsw     xmm0, xmm1
1479    psrlw      xmm0, 2
1480    packuswb   xmm0, xmm0
1481    movq       qword ptr [edx+16], xmm0
1482    lea        edx, [edx+24]
1483    sub        ecx, 24
1484    ja         wloop
1485    ret
1486  }
1487}
1488
1489#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
1490
1491// GCC versions of row functions are verbatim conversions from Visual C.
1492// Generated by disassembling the Visual C object file:
1493// objdump -D yuvscaler.obj >yuvscaler.txt
1494#define HAS_SCALEROWDOWN2_SSE2
1495static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
1496                               uint8* dst_ptr, int dst_width) {
1497  asm volatile (
1498  "pcmpeqb    %%xmm5,%%xmm5                    \n"
1499  "psrlw      $0x8,%%xmm5                      \n"
1500"1:"
1501  "movdqa     (%0),%%xmm0                      \n"
1502  "movdqa     0x10(%0),%%xmm1                  \n"
1503  "lea        0x20(%0),%0                      \n"
1504  "pand       %%xmm5,%%xmm0                    \n"
1505  "pand       %%xmm5,%%xmm1                    \n"
1506  "packuswb   %%xmm1,%%xmm0                    \n"
1507  "movdqa     %%xmm0,(%1)                      \n"
1508  "lea        0x10(%1),%1                      \n"
1509  "sub        $0x10,%2                         \n"
1510  "ja         1b                               \n"
1511  : "+r"(src_ptr),    // %0
1512    "+r"(dst_ptr),    // %1
1513    "+r"(dst_width)   // %2
1514  :
1515  : "memory", "cc"
1516);
1517}
1518
1519static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
1520                                  uint8* dst_ptr, int dst_width) {
1521  asm volatile (
1522  "pcmpeqb    %%xmm5,%%xmm5                    \n"
1523  "psrlw      $0x8,%%xmm5                      \n"
1524"1:"
1525  "movdqa     (%0),%%xmm0                      \n"
1526  "movdqa     0x10(%0),%%xmm1                  \n"
1527  "movdqa     (%0,%3,1),%%xmm2                 \n"
1528  "movdqa     0x10(%0,%3,1),%%xmm3             \n"
1529  "lea        0x20(%0),%0                      \n"
1530  "pavgb      %%xmm2,%%xmm0                    \n"
1531  "pavgb      %%xmm3,%%xmm1                    \n"
1532  "movdqa     %%xmm0,%%xmm2                    \n"
1533  "psrlw      $0x8,%%xmm0                      \n"
1534  "movdqa     %%xmm1,%%xmm3                    \n"
1535  "psrlw      $0x8,%%xmm1                      \n"
1536  "pand       %%xmm5,%%xmm2                    \n"
1537  "pand       %%xmm5,%%xmm3                    \n"
1538  "pavgw      %%xmm2,%%xmm0                    \n"
1539  "pavgw      %%xmm3,%%xmm1                    \n"
1540  "packuswb   %%xmm1,%%xmm0                    \n"
1541  "movdqa     %%xmm0,(%1)                      \n"
1542  "lea        0x10(%1),%1                      \n"
1543  "sub        $0x10,%2                         \n"
1544  "ja         1b                               \n"
1545  : "+r"(src_ptr),    // %0
1546    "+r"(dst_ptr),    // %1
1547    "+r"(dst_width)   // %2
1548  : "r"((intptr_t)(src_stride))   // %3
1549  : "memory", "cc"
1550);
1551}
1552
1553#define HAS_SCALEROWDOWN4_SSE2
1554static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
1555                               uint8* dst_ptr, int dst_width) {
1556  asm volatile (
1557  "pcmpeqb    %%xmm5,%%xmm5                    \n"
1558  "psrld      $0x18,%%xmm5                     \n"
1559"1:"
1560  "movdqa     (%0),%%xmm0                      \n"
1561  "movdqa     0x10(%0),%%xmm1                  \n"
1562  "lea        0x20(%0),%0                      \n"
1563  "pand       %%xmm5,%%xmm0                    \n"
1564  "pand       %%xmm5,%%xmm1                    \n"
1565  "packuswb   %%xmm1,%%xmm0                    \n"
1566  "packuswb   %%xmm0,%%xmm0                    \n"
1567  "movq       %%xmm0,(%1)                      \n"
1568  "lea        0x8(%1),%1                       \n"
1569  "sub        $0x8,%2                          \n"
1570  "ja         1b                               \n"
1571  : "+r"(src_ptr),    // %0
1572    "+r"(dst_ptr),    // %1
1573    "+r"(dst_width)   // %2
1574  :
1575  : "memory", "cc"
1576);
1577}
1578
1579static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
1580                                  uint8* dst_ptr, int dst_width) {
1581  intptr_t temp = 0;
1582  asm volatile (
1583  "pcmpeqb    %%xmm7,%%xmm7                    \n"
1584  "psrlw      $0x8,%%xmm7                      \n"
1585  "lea        (%4,%4,2),%3                     \n"
1586"1:"
1587  "movdqa     (%0),%%xmm0                      \n"
1588  "movdqa     0x10(%0),%%xmm1                  \n"
1589  "movdqa     (%0,%4,1),%%xmm2                 \n"
1590  "movdqa     0x10(%0,%4,1),%%xmm3             \n"
1591  "pavgb      %%xmm2,%%xmm0                    \n"
1592  "pavgb      %%xmm3,%%xmm1                    \n"
1593  "movdqa     (%0,%4,2),%%xmm2                 \n"
1594  "movdqa     0x10(%0,%4,2),%%xmm3             \n"
1595  "movdqa     (%0,%3,1),%%xmm4                 \n"
1596  "movdqa     0x10(%0,%3,1),%%xmm5             \n"
1597  "lea        0x20(%0),%0                      \n"
1598  "pavgb      %%xmm4,%%xmm2                    \n"
1599  "pavgb      %%xmm2,%%xmm0                    \n"
1600  "pavgb      %%xmm5,%%xmm3                    \n"
1601  "pavgb      %%xmm3,%%xmm1                    \n"
1602  "movdqa     %%xmm0,%%xmm2                    \n"
1603  "psrlw      $0x8,%%xmm0                      \n"
1604  "movdqa     %%xmm1,%%xmm3                    \n"
1605  "psrlw      $0x8,%%xmm1                      \n"
1606  "pand       %%xmm7,%%xmm2                    \n"
1607  "pand       %%xmm7,%%xmm3                    \n"
1608  "pavgw      %%xmm2,%%xmm0                    \n"
1609  "pavgw      %%xmm3,%%xmm1                    \n"
1610  "packuswb   %%xmm1,%%xmm0                    \n"
1611  "movdqa     %%xmm0,%%xmm2                    \n"
1612  "psrlw      $0x8,%%xmm0                      \n"
1613  "pand       %%xmm7,%%xmm2                    \n"
1614  "pavgw      %%xmm2,%%xmm0                    \n"
1615  "packuswb   %%xmm0,%%xmm0                    \n"
1616  "movq       %%xmm0,(%1)                      \n"
1617  "lea        0x8(%1),%1                       \n"
1618  "sub        $0x8,%2                          \n"
1619  "ja         1b                               \n"
1620  : "+r"(src_ptr),     // %0
1621    "+r"(dst_ptr),     // %1
1622    "+r"(dst_width),   // %2
1623    "+r"(temp)         // %3
1624  : "r"((intptr_t)(src_stride))    // %4
1625  : "memory", "cc"
1626#if defined(__x86_64__)
1627    , "xmm6", "xmm7"
1628#endif
1629);
1630}
1631
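// 1/8 scale by point sampling: keeps the low byte of each 64-bit group
// (every 8th source pixel). The ScaleRowDown8Int_SSE2 variants below
// approximate an 8x8 box filter using cascaded pavgb averages plus a psadbw
// sum over each group of 8 columns, shifted down by 3.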
1632#define HAS_SCALEROWDOWN8_SSE2
1633static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
1634                               uint8* dst_ptr, int dst_width) {
1635  asm volatile (
1636  "pcmpeqb    %%xmm5,%%xmm5                    \n"
1637  "psrlq      $0x38,%%xmm5                     \n"
1638"1:"
1639  "movdqa     (%0),%%xmm0                      \n"
1640  "movdqa     0x10(%0),%%xmm1                  \n"
1641  "lea        0x20(%0),%0                      \n"
1642  "pand       %%xmm5,%%xmm0                    \n"
1643  "pand       %%xmm5,%%xmm1                    \n"
1644  "packuswb   %%xmm1,%%xmm0                    \n"
1645  "packuswb   %%xmm0,%%xmm0                    \n"
1646  "packuswb   %%xmm0,%%xmm0                    \n"
1647  "movd       %%xmm0,(%1)                      \n"
1648  "lea        0x4(%1),%1                       \n"
1649  "sub        $0x4,%2                          \n"
1650  "ja         1b                               \n"
1651  : "+r"(src_ptr),    // %0
1652    "+r"(dst_ptr),    // %1
1653    "+r"(dst_width)   // %2
1654  :
1655  : "memory", "cc"
1656);
1657}
1658
1659#if defined(__i386__)
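// On 32-bit x86 the routines below are whole functions emitted from
// file-scope asm blocks: a C prototype is declared, DECLARE_FUNCTION emits
// the symbol, and the body saves the registers it uses, loads its arguments
// from the stack and returns with ret.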
1660void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
1661                                      uint8* dst_ptr, int dst_width);
1662  asm(
1663    DECLARE_FUNCTION(ScaleRowDown8Int_SSE2)
1664    "pusha                                     \n"
1665    "mov    0x24(%esp),%esi                    \n"
1666    "mov    0x28(%esp),%ebx                    \n"
1667    "mov    0x2c(%esp),%edi                    \n"
1668    "mov    0x30(%esp),%ecx                    \n"
1669    "lea    (%ebx,%ebx,2),%edx                 \n"
1670    "pxor   %xmm7,%xmm7                        \n"
1671
1672"1:"
1673    "movdqa (%esi),%xmm0                       \n"
1674    "movdqa 0x10(%esi),%xmm1                   \n"
1675    "movdqa (%esi,%ebx,1),%xmm2                \n"
1676    "movdqa 0x10(%esi,%ebx,1),%xmm3            \n"
1677    "pavgb  %xmm2,%xmm0                        \n"
1678    "pavgb  %xmm3,%xmm1                        \n"
1679    "movdqa (%esi,%ebx,2),%xmm2                \n"
1680    "movdqa 0x10(%esi,%ebx,2),%xmm3            \n"
1681    "movdqa (%esi,%edx,1),%xmm4                \n"
1682    "movdqa 0x10(%esi,%edx,1),%xmm5            \n"
1683    "lea    (%esi,%ebx,4),%ebp                 \n"
1684    "lea    0x20(%esi),%esi                    \n"
1685    "pavgb  %xmm4,%xmm2                        \n"
1686    "pavgb  %xmm5,%xmm3                        \n"
1687    "pavgb  %xmm2,%xmm0                        \n"
1688    "pavgb  %xmm3,%xmm1                        \n"
1689    "movdqa 0x0(%ebp),%xmm2                    \n"
1690    "movdqa 0x10(%ebp),%xmm3                   \n"
1691    "movdqa 0x0(%ebp,%ebx,1),%xmm4             \n"
1692    "movdqa 0x10(%ebp,%ebx,1),%xmm5            \n"
1693    "pavgb  %xmm4,%xmm2                        \n"
1694    "pavgb  %xmm5,%xmm3                        \n"
1695    "movdqa 0x0(%ebp,%ebx,2),%xmm4             \n"
1696    "movdqa 0x10(%ebp,%ebx,2),%xmm5            \n"
1697    "movdqa 0x0(%ebp,%edx,1),%xmm6             \n"
1698    "pavgb  %xmm6,%xmm4                        \n"
1699    "movdqa 0x10(%ebp,%edx,1),%xmm6            \n"
1700    "pavgb  %xmm6,%xmm5                        \n"
1701    "pavgb  %xmm4,%xmm2                        \n"
1702    "pavgb  %xmm5,%xmm3                        \n"
1703    "pavgb  %xmm2,%xmm0                        \n"
1704    "pavgb  %xmm3,%xmm1                        \n"
1705    "psadbw %xmm7,%xmm0                        \n"
1706    "psadbw %xmm7,%xmm1                        \n"
1707    "pshufd $0xd8,%xmm0,%xmm0                  \n"
1708    "pshufd $0x8d,%xmm1,%xmm1                  \n"
1709    "por    %xmm1,%xmm0                        \n"
1710    "psrlw  $0x3,%xmm0                         \n"
1711    "packuswb %xmm0,%xmm0                      \n"
1712    "packuswb %xmm0,%xmm0                      \n"
1713    "movd   %xmm0,(%edi)                       \n"
1714    "lea    0x4(%edi),%edi                     \n"
1715    "sub    $0x4,%ecx                          \n"
1716    "ja     1b                                 \n"
1717    "popa                                      \n"
1718    "ret                                       \n"
1719);
1720
1721// Non-PIC only: this asm references global constants by absolute address; -fPIC is used for the magiccam plugin.
1722#if !defined(__PIC__)
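// 3/4 horizontal scale: ScaleRowDown34_SSSE3 point-samples 32 source pixels
// down to 24 with pshufb. The _0_Int and _1_Int variants also blend two
// source rows (3:1 and 1:1 respectively) and apply a weighted horizontal
// filter via pmaddubsw with the _madd/_round34 constants.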
1723#define HAS_SCALEROWDOWN34_SSSE3
1724void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
1725                                     uint8* dst_ptr, int dst_width);
1726  asm(
1727    DECLARE_FUNCTION(ScaleRowDown34_SSSE3)
1728    "pusha                                     \n"
1729    "mov    0x24(%esp),%esi                    \n"
1730    "mov    0x2c(%esp),%edi                    \n"
1731    "mov    0x30(%esp),%ecx                    \n"
1732    "movdqa _shuf0,%xmm3                       \n"
1733    "movdqa _shuf1,%xmm4                       \n"
1734    "movdqa _shuf2,%xmm5                       \n"
1735
1736"1:"
1737    "movdqa (%esi),%xmm0                       \n"
1738    "movdqa 0x10(%esi),%xmm2                   \n"
1739    "lea    0x20(%esi),%esi                    \n"
1740    "movdqa %xmm2,%xmm1                        \n"
1741    "palignr $0x8,%xmm0,%xmm1                  \n"
1742    "pshufb %xmm3,%xmm0                        \n"
1743    "pshufb %xmm4,%xmm1                        \n"
1744    "pshufb %xmm5,%xmm2                        \n"
1745    "movq   %xmm0,(%edi)                       \n"
1746    "movq   %xmm1,0x8(%edi)                    \n"
1747    "movq   %xmm2,0x10(%edi)                   \n"
1748    "lea    0x18(%edi),%edi                    \n"
1749    "sub    $0x18,%ecx                         \n"
1750    "ja     1b                                 \n"
1751    "popa                                      \n"
1752    "ret                                       \n"
1753);
1754
1755void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
1756                                           uint8* dst_ptr, int dst_width);
1757  asm(
1758    DECLARE_FUNCTION(ScaleRowDown34_1_Int_SSSE3)
1759    "pusha                                     \n"
1760    "mov    0x24(%esp),%esi                    \n"
1761    "mov    0x28(%esp),%ebp                    \n"
1762    "mov    0x2c(%esp),%edi                    \n"
1763    "mov    0x30(%esp),%ecx                    \n"
1764    "movdqa _shuf01,%xmm2                      \n"
1765    "movdqa _shuf11,%xmm3                      \n"
1766    "movdqa _shuf21,%xmm4                      \n"
1767    "movdqa _madd01,%xmm5                      \n"
1768    "movdqa _madd11,%xmm6                      \n"
1769    "movdqa _round34,%xmm7                     \n"
1770
1771"1:"
1772    "movdqa (%esi),%xmm0                       \n"
1773    "movdqa (%esi,%ebp),%xmm1                  \n"
1774    "pavgb  %xmm1,%xmm0                        \n"
1775    "pshufb %xmm2,%xmm0                        \n"
1776    "pmaddubsw %xmm5,%xmm0                     \n"
1777    "paddsw %xmm7,%xmm0                        \n"
1778    "psrlw  $0x2,%xmm0                         \n"
1779    "packuswb %xmm0,%xmm0                      \n"
1780    "movq   %xmm0,(%edi)                       \n"
1781    "movdqu 0x8(%esi),%xmm0                    \n"
1782    "movdqu 0x8(%esi,%ebp),%xmm1               \n"
1783    "pavgb  %xmm1,%xmm0                        \n"
1784    "pshufb %xmm3,%xmm0                        \n"
1785    "pmaddubsw %xmm6,%xmm0                     \n"
1786    "paddsw %xmm7,%xmm0                        \n"
1787    "psrlw  $0x2,%xmm0                         \n"
1788    "packuswb %xmm0,%xmm0                      \n"
1789    "movq   %xmm0,0x8(%edi)                    \n"
1790    "movdqa 0x10(%esi),%xmm0                   \n"
1791    "movdqa 0x10(%esi,%ebp),%xmm1              \n"
1792    "lea    0x20(%esi),%esi                    \n"
1793    "pavgb  %xmm1,%xmm0                        \n"
1794    "pshufb %xmm4,%xmm0                        \n"
1795    "movdqa  _madd21,%xmm1                     \n"
1796    "pmaddubsw %xmm1,%xmm0                     \n"
1797    "paddsw %xmm7,%xmm0                        \n"
1798    "psrlw  $0x2,%xmm0                         \n"
1799    "packuswb %xmm0,%xmm0                      \n"
1800    "movq   %xmm0,0x10(%edi)                   \n"
1801    "lea    0x18(%edi),%edi                    \n"
1802    "sub    $0x18,%ecx                         \n"
1803    "ja     1b                                 \n"
1804
1805    "popa                                      \n"
1806    "ret                                       \n"
1807);
1808
1809void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
1810                                           uint8* dst_ptr, int dst_width);
1811  asm(
1812    DECLARE_FUNCTION(ScaleRowDown34_0_Int_SSSE3)
1813    "pusha                                     \n"
1814    "mov    0x24(%esp),%esi                    \n"
1815    "mov    0x28(%esp),%ebp                    \n"
1816    "mov    0x2c(%esp),%edi                    \n"
1817    "mov    0x30(%esp),%ecx                    \n"
1818    "movdqa _shuf01,%xmm2                      \n"
1819    "movdqa _shuf11,%xmm3                      \n"
1820    "movdqa _shuf21,%xmm4                      \n"
1821    "movdqa _madd01,%xmm5                      \n"
1822    "movdqa _madd11,%xmm6                      \n"
1823    "movdqa _round34,%xmm7                     \n"
1824
1825"1:"
1826    "movdqa (%esi),%xmm0                       \n"
1827    "movdqa (%esi,%ebp,1),%xmm1                \n"
1828    "pavgb  %xmm0,%xmm1                        \n"
1829    "pavgb  %xmm1,%xmm0                        \n"
1830    "pshufb %xmm2,%xmm0                        \n"
1831    "pmaddubsw %xmm5,%xmm0                     \n"
1832    "paddsw %xmm7,%xmm0                        \n"
1833    "psrlw  $0x2,%xmm0                         \n"
1834    "packuswb %xmm0,%xmm0                      \n"
1835    "movq   %xmm0,(%edi)                       \n"
1836    "movdqu 0x8(%esi),%xmm0                    \n"
1837    "movdqu 0x8(%esi,%ebp,1),%xmm1             \n"
1838    "pavgb  %xmm0,%xmm1                        \n"
1839    "pavgb  %xmm1,%xmm0                        \n"
1840    "pshufb %xmm3,%xmm0                        \n"
1841    "pmaddubsw %xmm6,%xmm0                     \n"
1842    "paddsw %xmm7,%xmm0                        \n"
1843    "psrlw  $0x2,%xmm0                         \n"
1844    "packuswb %xmm0,%xmm0                      \n"
1845    "movq   %xmm0,0x8(%edi)                    \n"
1846    "movdqa 0x10(%esi),%xmm0                   \n"
1847    "movdqa 0x10(%esi,%ebp,1),%xmm1            \n"
1848    "lea    0x20(%esi),%esi                    \n"
1849    "pavgb  %xmm0,%xmm1                        \n"
1850    "pavgb  %xmm1,%xmm0                        \n"
1851    "pshufb %xmm4,%xmm0                        \n"
1852    "movdqa  _madd21,%xmm1                     \n"
1853    "pmaddubsw %xmm1,%xmm0                     \n"
1854    "paddsw %xmm7,%xmm0                        \n"
1855    "psrlw  $0x2,%xmm0                         \n"
1856    "packuswb %xmm0,%xmm0                      \n"
1857    "movq   %xmm0,0x10(%edi)                   \n"
1858    "lea    0x18(%edi),%edi                    \n"
1859    "sub    $0x18,%ecx                         \n"
1860    "ja     1b                                 \n"
1861    "popa                                      \n"
1862    "ret                                       \n"
1863);
1864
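// 3/8 horizontal scale: ScaleRowDown38_SSSE3 selects and combines pixels with
// pshufb/paddusb (no row filtering). The _3_Int and _2_Int variants box filter
// 3 or 2 source rows and combine neighboring columns per output pixel, then
// scale the accumulated sums back down with a fixed-point multiply (pmulhuw).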
1865#define HAS_SCALEROWDOWN38_SSSE3
1866void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
1867                                     uint8* dst_ptr, int dst_width);
1868  asm(
1869    DECLARE_FUNCTION(ScaleRowDown38_SSSE3)
1870    "pusha                                     \n"
1871    "mov    0x24(%esp),%esi                    \n"
1872    "mov    0x28(%esp),%edx                    \n"
1873    "mov    0x2c(%esp),%edi                    \n"
1874    "mov    0x30(%esp),%ecx                    \n"
1875    "movdqa _shuf38a ,%xmm4                    \n"
1876    "movdqa _shuf38b ,%xmm5                    \n"
1877
1878"1:"
1879    "movdqa (%esi),%xmm0                       \n"
1880    "movdqa 0x10(%esi),%xmm1                   \n"
1881    "lea    0x20(%esi),%esi                    \n"
1882    "pshufb %xmm4,%xmm0                        \n"
1883    "pshufb %xmm5,%xmm1                        \n"
1884    "paddusb %xmm1,%xmm0                       \n"
1885    "movq   %xmm0,(%edi)                       \n"
1886    "movhlps %xmm0,%xmm1                       \n"
1887    "movd   %xmm1,0x8(%edi)                    \n"
1888    "lea    0xc(%edi),%edi                     \n"
1889    "sub    $0xc,%ecx                          \n"
1890    "ja     1b                                 \n"
1891    "popa                                      \n"
1892    "ret                                       \n"
1893);
1894
1895void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
1896                                           uint8* dst_ptr, int dst_width);
1897  asm(
1898    DECLARE_FUNCTION(ScaleRowDown38_3_Int_SSSE3)
1899    "pusha                                     \n"
1900    "mov    0x24(%esp),%esi                    \n"
1901    "mov    0x28(%esp),%edx                    \n"
1902    "mov    0x2c(%esp),%edi                    \n"
1903    "mov    0x30(%esp),%ecx                    \n"
1904    "movdqa _shufac0,%xmm4                     \n"
1905    "movdqa _shufac3,%xmm5                     \n"
1906    "movdqa _scaleac3,%xmm6                    \n"
1907    "pxor   %xmm7,%xmm7                        \n"
1908
1909"1:"
1910    "movdqa (%esi),%xmm0                       \n"
1911    "movdqa (%esi,%edx,1),%xmm2                \n"
1912    "movhlps %xmm0,%xmm1                       \n"
1913    "movhlps %xmm2,%xmm3                       \n"
1914    "punpcklbw %xmm7,%xmm0                     \n"
1915    "punpcklbw %xmm7,%xmm1                     \n"
1916    "punpcklbw %xmm7,%xmm2                     \n"
1917    "punpcklbw %xmm7,%xmm3                     \n"
1918    "paddusw %xmm2,%xmm0                       \n"
1919    "paddusw %xmm3,%xmm1                       \n"
1920    "movdqa (%esi,%edx,2),%xmm2                \n"
1921    "lea    0x10(%esi),%esi                    \n"
1922    "movhlps %xmm2,%xmm3                       \n"
1923    "punpcklbw %xmm7,%xmm2                     \n"
1924    "punpcklbw %xmm7,%xmm3                     \n"
1925    "paddusw %xmm2,%xmm0                       \n"
1926    "paddusw %xmm3,%xmm1                       \n"
1927    "movdqa %xmm0,%xmm2                        \n"
1928    "psrldq $0x2,%xmm0                         \n"
1929    "paddusw %xmm0,%xmm2                       \n"
1930    "psrldq $0x2,%xmm0                         \n"
1931    "paddusw %xmm0,%xmm2                       \n"
1932    "pshufb %xmm4,%xmm2                        \n"
1933    "movdqa %xmm1,%xmm3                        \n"
1934    "psrldq $0x2,%xmm1                         \n"
1935    "paddusw %xmm1,%xmm3                       \n"
1936    "psrldq $0x2,%xmm1                         \n"
1937    "paddusw %xmm1,%xmm3                       \n"
1938    "pshufb %xmm5,%xmm3                        \n"
1939    "paddusw %xmm3,%xmm2                       \n"
1940    "pmulhuw %xmm6,%xmm2                       \n"
1941    "packuswb %xmm2,%xmm2                      \n"
1942    "movd   %xmm2,(%edi)                       \n"
1943    "pextrw $0x2,%xmm2,%eax                    \n"
1944    "mov    %ax,0x4(%edi)                      \n"
1945    "lea    0x6(%edi),%edi                     \n"
1946    "sub    $0x6,%ecx                          \n"
1947    "ja     1b                                 \n"
1948    "popa                                      \n"
1949    "ret                                       \n"
1950);
1951
1952void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
1953                                           uint8* dst_ptr, int dst_width);
1954  asm(
1955    DECLARE_FUNCTION(ScaleRowDown38_2_Int_SSSE3)
1956    "pusha                                     \n"
1957    "mov    0x24(%esp),%esi                    \n"
1958    "mov    0x28(%esp),%edx                    \n"
1959    "mov    0x2c(%esp),%edi                    \n"
1960    "mov    0x30(%esp),%ecx                    \n"
1961    "movdqa _shufab0,%xmm4                     \n"
1962    "movdqa _shufab1,%xmm5                     \n"
1963    "movdqa _shufab2,%xmm6                     \n"
1964    "movdqa _scaleab2,%xmm7                    \n"
1965
1966"1:"
1967    "movdqa (%esi),%xmm2                       \n"
1968    "pavgb  (%esi,%edx,1),%xmm2                \n"
1969    "lea    0x10(%esi),%esi                    \n"
1970    "movdqa %xmm2,%xmm0                        \n"
1971    "pshufb %xmm4,%xmm0                        \n"
1972    "movdqa %xmm2,%xmm1                        \n"
1973    "pshufb %xmm5,%xmm1                        \n"
1974    "paddusw %xmm1,%xmm0                       \n"
1975    "pshufb %xmm6,%xmm2                        \n"
1976    "paddusw %xmm2,%xmm0                       \n"
1977    "pmulhuw %xmm7,%xmm0                       \n"
1978    "packuswb %xmm0,%xmm0                      \n"
1979    "movd   %xmm0,(%edi)                       \n"
1980    "pextrw $0x2,%xmm0,%eax                    \n"
1981    "mov    %ax,0x4(%edi)                      \n"
1982    "lea    0x6(%edi),%edi                     \n"
1983    "sub    $0x6,%ecx                          \n"
1984    "ja     1b                                 \n"
1985    "popa                                      \n"
1986    "ret                                       \n"
1987);
1988#endif // __PIC__
1989
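// ScaleAddRows sums src_height rows of source pixels into 16-bit column sums
// for the box filter.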
1990#define HAS_SCALEADDROWS_SSE2
1991void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
1992                                  uint16* dst_ptr, int src_width,
1993                                  int src_height);
1994  asm(
1995    DECLARE_FUNCTION(ScaleAddRows_SSE2)
1996    "pusha                                     \n"
1997    "mov    0x24(%esp),%esi                    \n"
1998    "mov    0x28(%esp),%edx                    \n"
1999    "mov    0x2c(%esp),%edi                    \n"
2000    "mov    0x30(%esp),%ecx                    \n"
2001    "mov    0x34(%esp),%ebx                    \n"
2002    "pxor   %xmm5,%xmm5                        \n"
2003
2004"1:"
2005    "movdqa (%esi),%xmm2                       \n"
2006    "lea    (%esi,%edx,1),%eax                 \n"
2007    "movhlps %xmm2,%xmm3                       \n"
2008    "lea    -0x1(%ebx),%ebp                    \n"
2009    "punpcklbw %xmm5,%xmm2                     \n"
2010    "punpcklbw %xmm5,%xmm3                     \n"
2011
2012"2:"
2013    "movdqa (%eax),%xmm0                       \n"
2014    "lea    (%eax,%edx,1),%eax                 \n"
2015    "movhlps %xmm0,%xmm1                       \n"
2016    "punpcklbw %xmm5,%xmm0                     \n"
2017    "punpcklbw %xmm5,%xmm1                     \n"
2018    "paddusw %xmm0,%xmm2                       \n"
2019    "paddusw %xmm1,%xmm3                       \n"
2020    "sub    $0x1,%ebp                          \n"
2021    "ja     2b                                 \n"
2022
2023    "movdqa %xmm2,(%edi)                       \n"
2024    "movdqa %xmm3,0x10(%edi)                   \n"
2025    "lea    0x20(%edi),%edi                    \n"
2026    "lea    0x10(%esi),%esi                    \n"
2027    "sub    $0x10,%ecx                         \n"
2028    "ja     1b                                 \n"
2029    "popa                                      \n"
2030    "ret                                       \n"
2031);
2032
2033// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
2034#define HAS_SCALEFILTERROWS_SSE2
2035void ScaleFilterRows_SSE2(uint8* dst_ptr,
2036                                     const uint8* src_ptr, int src_stride,
2037                                     int dst_width, int source_y_fraction);
2038  asm(
2039    DECLARE_FUNCTION(ScaleFilterRows_SSE2)
2040    "push   %esi                               \n"
2041    "push   %edi                               \n"
2042    "mov    0xc(%esp),%edi                     \n"
2043    "mov    0x10(%esp),%esi                    \n"
2044    "mov    0x14(%esp),%edx                    \n"
2045    "mov    0x18(%esp),%ecx                    \n"
2046    "mov    0x1c(%esp),%eax                    \n"
2047    "cmp    $0x0,%eax                          \n"
2048    "je     2f                                 \n"
2049    "cmp    $0x80,%eax                         \n"
2050    "je     3f                                 \n"
2051    "movd   %eax,%xmm6                         \n"
2052    "punpcklwd %xmm6,%xmm6                     \n"
2053    "pshufd $0x0,%xmm6,%xmm6                   \n"
2054    "neg    %eax                               \n"
2055    "add    $0x100,%eax                        \n"
2056    "movd   %eax,%xmm5                         \n"
2057    "punpcklwd %xmm5,%xmm5                     \n"
2058    "pshufd $0x0,%xmm5,%xmm5                   \n"
2059    "pxor   %xmm7,%xmm7                        \n"
2060
2061"1:"
2062    "movdqa (%esi),%xmm0                       \n"
2063    "movdqa (%esi,%edx,1),%xmm2                \n"
2064    "lea    0x10(%esi),%esi                    \n"
2065    "movdqa %xmm0,%xmm1                        \n"
2066    "movdqa %xmm2,%xmm3                        \n"
2067    "punpcklbw %xmm7,%xmm0                     \n"
2068    "punpcklbw %xmm7,%xmm2                     \n"
2069    "punpckhbw %xmm7,%xmm1                     \n"
2070    "punpckhbw %xmm7,%xmm3                     \n"
2071    "pmullw %xmm5,%xmm0                        \n"
2072    "pmullw %xmm5,%xmm1                        \n"
2073    "pmullw %xmm6,%xmm2                        \n"
2074    "pmullw %xmm6,%xmm3                        \n"
2075    "paddusw %xmm2,%xmm0                       \n"
2076    "paddusw %xmm3,%xmm1                       \n"
2077    "psrlw  $0x8,%xmm0                         \n"
2078    "psrlw  $0x8,%xmm1                         \n"
2079    "packuswb %xmm1,%xmm0                      \n"
2080    "movdqa %xmm0,(%edi)                       \n"
2081    "lea    0x10(%edi),%edi                    \n"
2082    "sub    $0x10,%ecx                         \n"
2083    "ja     1b                                 \n"
2084    "mov    -0x1(%edi),%al                     \n"
2085    "mov    %al,(%edi)                         \n"
2086    "pop    %edi                               \n"
2087    "pop    %esi                               \n"
2088    "ret                                       \n"
2089
2090"2:"
2091    "movdqa (%esi),%xmm0                       \n"
2092    "lea    0x10(%esi),%esi                    \n"
2093    "movdqa %xmm0,(%edi)                       \n"
2094    "lea    0x10(%edi),%edi                    \n"
2095    "sub    $0x10,%ecx                         \n"
2096    "ja     2b                                 \n"
2097
2098    "mov    -0x1(%edi),%al                     \n"
2099    "mov    %al,(%edi)                         \n"
2100    "pop    %edi                               \n"
2101    "pop    %esi                               \n"
2102    "ret                                       \n"
2103
2104"3:"
2105    "movdqa (%esi),%xmm0                       \n"
2106    "movdqa (%esi,%edx,1),%xmm2                \n"
2107    "lea    0x10(%esi),%esi                    \n"
2108    "pavgb  %xmm2,%xmm0                        \n"
2109    "movdqa %xmm0,(%edi)                       \n"
2110    "lea    0x10(%edi),%edi                    \n"
2111    "sub    $0x10,%ecx                         \n"
2112    "ja     3b                                 \n"
2113
2114    "mov    -0x1(%edi),%al                     \n"
2115    "mov    %al,(%edi)                         \n"
2116    "pop    %edi                               \n"
2117    "pop    %esi                               \n"
2118    "ret                                       \n"
2119);
2120
2121// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
2122#define HAS_SCALEFILTERROWS_SSSE3
2123void ScaleFilterRows_SSSE3(uint8* dst_ptr,
2124                                      const uint8* src_ptr, int src_stride,
2125                                      int dst_width, int source_y_fraction);
2126  asm(
2127    DECLARE_FUNCTION(ScaleFilterRows_SSSE3)
2128    "push   %esi                               \n"
2129    "push   %edi                               \n"
2130    "mov    0xc(%esp),%edi                     \n"
2131    "mov    0x10(%esp),%esi                    \n"
2132    "mov    0x14(%esp),%edx                    \n"
2133    "mov    0x18(%esp),%ecx                    \n"
2134    "mov    0x1c(%esp),%eax                    \n"
2135    "cmp    $0x0,%eax                          \n"
2136    "je     2f                                 \n"
2137    "cmp    $0x80,%eax                         \n"
2138    "je     3f                                 \n"
2139    "shr    %eax                               \n"
2140    "mov    %al,%ah                            \n"
2141    "neg    %al                                \n"
2142    "add    $0x80,%al                          \n"
2143    "movd   %eax,%xmm5                         \n"
2144    "punpcklwd %xmm5,%xmm5                     \n"
2145    "pshufd $0x0,%xmm5,%xmm5                   \n"
2146
2147"1:"
2148    "movdqa (%esi),%xmm0                       \n"
2149    "movdqa (%esi,%edx,1),%xmm2                \n"
2150    "lea    0x10(%esi),%esi                    \n"
2151    "movdqa %xmm0,%xmm1                        \n"
2152    "punpcklbw %xmm2,%xmm0                     \n"
2153    "punpckhbw %xmm2,%xmm1                     \n"
2154    "pmaddubsw %xmm5,%xmm0                     \n"
2155    "pmaddubsw %xmm5,%xmm1                     \n"
2156    "psrlw  $0x7,%xmm0                         \n"
2157    "psrlw  $0x7,%xmm1                         \n"
2158    "packuswb %xmm1,%xmm0                      \n"
2159    "movdqa %xmm0,(%edi)                       \n"
2160    "lea    0x10(%edi),%edi                    \n"
2161    "sub    $0x10,%ecx                         \n"
2162    "ja     1b                                 \n"
2163    "mov    -0x1(%edi),%al                     \n"
2164    "mov    %al,(%edi)                         \n"
2165    "pop    %edi                               \n"
2166    "pop    %esi                               \n"
2167    "ret                                       \n"
2168
2169"2:"
2170    "movdqa (%esi),%xmm0                       \n"
2171    "lea    0x10(%esi),%esi                    \n"
2172    "movdqa %xmm0,(%edi)                       \n"
2173    "lea    0x10(%edi),%edi                    \n"
2174    "sub    $0x10,%ecx                         \n"
2175    "ja     2b                                 \n"
2176    "mov    -0x1(%edi),%al                     \n"
2177    "mov    %al,(%edi)                         \n"
2178    "pop    %edi                               \n"
2179    "pop    %esi                               \n"
2180    "ret                                       \n"
2181
2182"3:"
2183    "movdqa (%esi),%xmm0                       \n"
2184    "movdqa (%esi,%edx,1),%xmm2                \n"
2185    "lea    0x10(%esi),%esi                    \n"
2186    "pavgb  %xmm2,%xmm0                        \n"
2187    "movdqa %xmm0,(%edi)                       \n"
2188    "lea    0x10(%edi),%edi                    \n"
2189    "sub    $0x10,%ecx                         \n"
2190    "ja     3b                                 \n"
2191    "mov    -0x1(%edi),%al                     \n"
2192    "mov    %al,(%edi)                         \n"
2193    "pop    %edi                               \n"
2194    "pop    %esi                               \n"
2195    "ret                                       \n"
2196);
2197
2198#elif defined(__x86_64__)
2199static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
2200                                  uint8* dst_ptr, int dst_width) {
2201  asm volatile (
2202  "lea        (%3,%3,2),%%r10                  \n"
2203  "pxor       %%xmm7,%%xmm7                    \n"
2204"1:"
2205  "movdqa     (%0),%%xmm0                      \n"
2206  "movdqa     0x10(%0),%%xmm1                  \n"
2207  "movdqa     (%0,%3,1),%%xmm2                 \n"
2208  "movdqa     0x10(%0,%3,1),%%xmm3             \n"
2209  "pavgb      %%xmm2,%%xmm0                    \n"
2210  "pavgb      %%xmm3,%%xmm1                    \n"
2211  "movdqa     (%0,%3,2),%%xmm2                 \n"
2212  "movdqa     0x10(%0,%3,2),%%xmm3             \n"
2213  "movdqa     (%0,%%r10,1),%%xmm4              \n"
2214  "movdqa     0x10(%0,%%r10,1),%%xmm5          \n"
2215  "lea        (%0,%3,4),%%r11                  \n"
2216  "lea        0x20(%0),%0                      \n"
2217  "pavgb      %%xmm4,%%xmm2                    \n"
2218  "pavgb      %%xmm5,%%xmm3                    \n"
2219  "pavgb      %%xmm2,%%xmm0                    \n"
2220  "pavgb      %%xmm3,%%xmm1                    \n"
2221  "movdqa     0x0(%%r11),%%xmm2                \n"
2222  "movdqa     0x10(%%r11),%%xmm3               \n"
2223  "movdqa     0x0(%%r11,%3,1),%%xmm4           \n"
2224  "movdqa     0x10(%%r11,%3,1),%%xmm5          \n"
2225  "pavgb      %%xmm4,%%xmm2                    \n"
2226  "pavgb      %%xmm5,%%xmm3                    \n"
2227  "movdqa     0x0(%%r11,%3,2),%%xmm4           \n"
2228  "movdqa     0x10(%%r11,%3,2),%%xmm5          \n"
2229  "movdqa     0x0(%%r11,%%r10,1),%%xmm6        \n"
2230  "pavgb      %%xmm6,%%xmm4                    \n"
2231  "movdqa     0x10(%%r11,%%r10,1),%%xmm6       \n"
2232  "pavgb      %%xmm6,%%xmm5                    \n"
2233  "pavgb      %%xmm4,%%xmm2                    \n"
2234  "pavgb      %%xmm5,%%xmm3                    \n"
2235  "pavgb      %%xmm2,%%xmm0                    \n"
2236  "pavgb      %%xmm3,%%xmm1                    \n"
2237  "psadbw     %%xmm7,%%xmm0                    \n"
2238  "psadbw     %%xmm7,%%xmm1                    \n"
2239  "pshufd     $0xd8,%%xmm0,%%xmm0              \n"
2240  "pshufd     $0x8d,%%xmm1,%%xmm1              \n"
2241  "por        %%xmm1,%%xmm0                    \n"
2242  "psrlw      $0x3,%%xmm0                      \n"
2243  "packuswb   %%xmm0,%%xmm0                    \n"
2244  "packuswb   %%xmm0,%%xmm0                    \n"
2245  "movd       %%xmm0,(%1)                      \n"
2246  "lea        0x4(%1),%1                       \n"
2247  "sub        $0x4,%2                          \n"
2248  "ja         1b                               \n"
2249  : "+r"(src_ptr),     // %0
2250    "+r"(dst_ptr),     // %1
2251    "+r"(dst_width)    // %2
2252  : "r"((intptr_t)(src_stride))   // %3
2253  : "memory", "cc", "r10", "r11", "xmm6", "xmm7"
2254);
2255}
2256
2257#define HAS_SCALEROWDOWN34_SSSE3
2258static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
2259                                 uint8* dst_ptr, int dst_width) {
2260  asm volatile (
2261  "movdqa     (%3),%%xmm3                      \n"
2262  "movdqa     (%4),%%xmm4                      \n"
2263  "movdqa     (%5),%%xmm5                      \n"
2264"1:"
2265  "movdqa     (%0),%%xmm0                      \n"
2266  "movdqa     0x10(%0),%%xmm2                  \n"
2267  "lea        0x20(%0),%0                      \n"
2268  "movdqa     %%xmm2,%%xmm1                    \n"
2269  "palignr    $0x8,%%xmm0,%%xmm1               \n"
2270  "pshufb     %%xmm3,%%xmm0                    \n"
2271  "pshufb     %%xmm4,%%xmm1                    \n"
2272  "pshufb     %%xmm5,%%xmm2                    \n"
2273  "movq       %%xmm0,(%1)                      \n"
2274  "movq       %%xmm1,0x8(%1)                   \n"
2275  "movq       %%xmm2,0x10(%1)                  \n"
2276  "lea        0x18(%1),%1                      \n"
2277  "sub        $0x18,%2                         \n"
2278  "ja         1b                               \n"
2279  : "+r"(src_ptr),     // %0
2280    "+r"(dst_ptr),     // %1
2281    "+r"(dst_width)    // %2
2282  : "r"(_shuf0),   // %3
2283    "r"(_shuf1),   // %4
2284    "r"(_shuf2)    // %5
2285  : "memory", "cc"
2286);
2287}
2288
2289static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
2290                                       uint8* dst_ptr, int dst_width) {
2291  asm volatile (
2292  "movdqa     (%4),%%xmm2                      \n"  // _shuf01
2293  "movdqa     (%5),%%xmm3                      \n"  // _shuf11
2294  "movdqa     (%6),%%xmm4                      \n"  // _shuf21
2295  "movdqa     (%7),%%xmm5                      \n"  // _madd01
2296  "movdqa     (%8),%%xmm6                      \n"  // _madd11
2297  "movdqa     (%9),%%xmm7                      \n"  // _round34
2298  "movdqa     (%10),%%xmm8                     \n"  // _madd21
2299"1:"
2300  "movdqa     (%0),%%xmm0                      \n"
2301  "movdqa     (%0,%3),%%xmm1                   \n"
2302  "pavgb      %%xmm1,%%xmm0                    \n"
2303  "pshufb     %%xmm2,%%xmm0                    \n"
2304  "pmaddubsw  %%xmm5,%%xmm0                    \n"
2305  "paddsw     %%xmm7,%%xmm0                    \n"
2306  "psrlw      $0x2,%%xmm0                      \n"
2307  "packuswb   %%xmm0,%%xmm0                    \n"
2308  "movq       %%xmm0,(%1)                      \n"
2309  "movdqu     0x8(%0),%%xmm0                   \n"
2310  "movdqu     0x8(%0,%3),%%xmm1                \n"
2311  "pavgb      %%xmm1,%%xmm0                    \n"
2312  "pshufb     %%xmm3,%%xmm0                    \n"
2313  "pmaddubsw  %%xmm6,%%xmm0                    \n"
2314  "paddsw     %%xmm7,%%xmm0                    \n"
2315  "psrlw      $0x2,%%xmm0                      \n"
2316  "packuswb   %%xmm0,%%xmm0                    \n"
2317  "movq       %%xmm0,0x8(%1)                   \n"
2318  "movdqa     0x10(%0),%%xmm0                  \n"
2319  "movdqa     0x10(%0,%3),%%xmm1               \n"
2320  "lea        0x20(%0),%0                      \n"
2321  "pavgb      %%xmm1,%%xmm0                    \n"
2322  "pshufb     %%xmm4,%%xmm0                    \n"
2323  "pmaddubsw  %%xmm8,%%xmm0                    \n"
2324  "paddsw     %%xmm7,%%xmm0                    \n"
2325  "psrlw      $0x2,%%xmm0                      \n"
2326  "packuswb   %%xmm0,%%xmm0                    \n"
2327  "movq       %%xmm0,0x10(%1)                  \n"
2328  "lea        0x18(%1),%1                      \n"
2329  "sub        $0x18,%2                         \n"
2330  "ja         1b                               \n"
2331  : "+r"(src_ptr),     // %0
2332    "+r"(dst_ptr),     // %1
2333    "+r"(dst_width)    // %2
2334  : "r"((intptr_t)(src_stride)),  // %3
2335    "r"(_shuf01),   // %4
2336    "r"(_shuf11),   // %5
2337    "r"(_shuf21),   // %6
2338    "r"(_madd01),   // %7
2339    "r"(_madd11),   // %8
2340    "r"(_round34),  // %9
2341    "r"(_madd21)    // %10
2342  : "memory", "cc", "xmm6", "xmm7", "xmm8"
2343);
2344}
2345
2346static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
2347                                       uint8* dst_ptr, int dst_width) {
2348  asm volatile (
2349  "movdqa     (%4),%%xmm2                      \n"  // _shuf01
2350  "movdqa     (%5),%%xmm3                      \n"  // _shuf11
2351  "movdqa     (%6),%%xmm4                      \n"  // _shuf21
2352  "movdqa     (%7),%%xmm5                      \n"  // _madd01
2353  "movdqa     (%8),%%xmm6                      \n"  // _madd11
2354  "movdqa     (%9),%%xmm7                      \n"  // _round34
2355  "movdqa     (%10),%%xmm8                     \n"  // _madd21
2356"1:"
2357  "movdqa     (%0),%%xmm0                      \n"
2358  "movdqa     (%0,%3,1),%%xmm1                 \n"
2359  "pavgb      %%xmm0,%%xmm1                    \n"
2360  "pavgb      %%xmm1,%%xmm0                    \n"
2361  "pshufb     %%xmm2,%%xmm0                    \n"
2362  "pmaddubsw  %%xmm5,%%xmm0                    \n"
2363  "paddsw     %%xmm7,%%xmm0                    \n"
2364  "psrlw      $0x2,%%xmm0                      \n"
2365  "packuswb   %%xmm0,%%xmm0                    \n"
2366  "movq       %%xmm0,(%1)                      \n"
2367  "movdqu     0x8(%0),%%xmm0                   \n"
2368  "movdqu     0x8(%0,%3,1),%%xmm1              \n"
2369  "pavgb      %%xmm0,%%xmm1                    \n"
2370  "pavgb      %%xmm1,%%xmm0                    \n"
2371  "pshufb     %%xmm3,%%xmm0                    \n"
2372  "pmaddubsw  %%xmm6,%%xmm0                    \n"
2373  "paddsw     %%xmm7,%%xmm0                    \n"
2374  "psrlw      $0x2,%%xmm0                      \n"
2375  "packuswb   %%xmm0,%%xmm0                    \n"
2376  "movq       %%xmm0,0x8(%1)                   \n"
2377  "movdqa     0x10(%0),%%xmm0                  \n"
2378  "movdqa     0x10(%0,%3,1),%%xmm1             \n"
2379  "lea        0x20(%0),%0                      \n"
2380  "pavgb      %%xmm0,%%xmm1                    \n"
2381  "pavgb      %%xmm1,%%xmm0                    \n"
2382  "pshufb     %%xmm4,%%xmm0                    \n"
2383  "pmaddubsw  %%xmm8,%%xmm0                    \n"
2384  "paddsw     %%xmm7,%%xmm0                    \n"
2385  "psrlw      $0x2,%%xmm0                      \n"
2386  "packuswb   %%xmm0,%%xmm0                    \n"
2387  "movq       %%xmm0,0x10(%1)                  \n"
2388  "lea        0x18(%1),%1                      \n"
2389  "sub        $0x18,%2                         \n"
2390  "ja         1b                               \n"
2391  : "+r"(src_ptr),     // %0
2392    "+r"(dst_ptr),     // %1
2393    "+r"(dst_width)    // %2
2394  : "r"((intptr_t)(src_stride)),  // %3
2395    "r"(_shuf01),   // %4
2396    "r"(_shuf11),   // %5
2397    "r"(_shuf21),   // %6
2398    "r"(_madd01),   // %7
2399    "r"(_madd11),   // %8
2400    "r"(_round34),  // %9
2401    "r"(_madd21)    // %10
2402  : "memory", "cc", "xmm6", "xmm7", "xmm8"
2403);
2404}
2405
2406#define HAS_SCALEROWDOWN38_SSSE3
2407static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
2408                                 uint8* dst_ptr, int dst_width) {
2409  asm volatile (
2410  "movdqa     (%3),%%xmm4                      \n"
2411  "movdqa     (%4),%%xmm5                      \n"
2412"1:"
2413  "movdqa     (%0),%%xmm0                      \n"
2414  "movdqa     0x10(%0),%%xmm1                  \n"
2415  "lea        0x20(%0),%0                      \n"
2416  "pshufb     %%xmm4,%%xmm0                    \n"
2417  "pshufb     %%xmm5,%%xmm1                    \n"
2418  "paddusb    %%xmm1,%%xmm0                    \n"
2419  "movq       %%xmm0,(%1)                      \n"
2420  "movhlps    %%xmm0,%%xmm1                    \n"
2421  "movd       %%xmm1,0x8(%1)                   \n"
2422  "lea        0xc(%1),%1                       \n"
2423  "sub        $0xc,%2                          \n"
2424  "ja         1b                               \n"
2425  : "+r"(src_ptr),     // %0
2426    "+r"(dst_ptr),     // %1
2427    "+r"(dst_width)    // %2
2428  : "r"(_shuf38a),  // %3
2429    "r"(_shuf38b)   // %4
2430  : "memory", "cc"
2431);
2432}
2433
2434static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
2435                                       uint8* dst_ptr, int dst_width) {
2436  asm volatile (
2437  "movdqa     (%4),%%xmm4                      \n"
2438  "movdqa     (%5),%%xmm5                      \n"
2439  "movdqa     (%6),%%xmm6                      \n"
2440  "pxor       %%xmm7,%%xmm7                    \n"
2441"1:"
2442  "movdqa     (%0),%%xmm0                      \n"
2443  "movdqa     (%0,%3,1),%%xmm2                 \n"
2444  "movhlps    %%xmm0,%%xmm1                    \n"
2445  "movhlps    %%xmm2,%%xmm3                    \n"
2446  "punpcklbw  %%xmm7,%%xmm0                    \n"
2447  "punpcklbw  %%xmm7,%%xmm1                    \n"
2448  "punpcklbw  %%xmm7,%%xmm2                    \n"
2449  "punpcklbw  %%xmm7,%%xmm3                    \n"
2450  "paddusw    %%xmm2,%%xmm0                    \n"
2451  "paddusw    %%xmm3,%%xmm1                    \n"
2452  "movdqa     (%0,%3,2),%%xmm2                 \n"
2453  "lea        0x10(%0),%0                      \n"
2454  "movhlps    %%xmm2,%%xmm3                    \n"
2455  "punpcklbw  %%xmm7,%%xmm2                    \n"
2456  "punpcklbw  %%xmm7,%%xmm3                    \n"
2457  "paddusw    %%xmm2,%%xmm0                    \n"
2458  "paddusw    %%xmm3,%%xmm1                    \n"
2459  "movdqa     %%xmm0,%%xmm2                    \n"
2460  "psrldq     $0x2,%%xmm0                      \n"
2461  "paddusw    %%xmm0,%%xmm2                    \n"
2462  "psrldq     $0x2,%%xmm0                      \n"
2463  "paddusw    %%xmm0,%%xmm2                    \n"
2464  "pshufb     %%xmm4,%%xmm2                    \n"
2465  "movdqa     %%xmm1,%%xmm3                    \n"
2466  "psrldq     $0x2,%%xmm1                      \n"
2467  "paddusw    %%xmm1,%%xmm3                    \n"
2468  "psrldq     $0x2,%%xmm1                      \n"
2469  "paddusw    %%xmm1,%%xmm3                    \n"
2470  "pshufb     %%xmm5,%%xmm3                    \n"
2471  "paddusw    %%xmm3,%%xmm2                    \n"
2472  "pmulhuw    %%xmm6,%%xmm2                    \n"
2473  "packuswb   %%xmm2,%%xmm2                    \n"
2474  "movd       %%xmm2,(%1)                      \n"
2475  "pextrw     $0x2,%%xmm2,%%eax                \n"
2476  "mov        %%ax,0x4(%1)                     \n"
2477  "lea        0x6(%1),%1                       \n"
2478  "sub        $0x6,%2                          \n"
2479  "ja         1b                               \n"
2480  : "+r"(src_ptr),     // %0
2481    "+r"(dst_ptr),     // %1
2482    "+r"(dst_width)    // %2
2483  : "r"((intptr_t)(src_stride)),  // %3
2484    "r"(_shufac0),   // %4
2485    "r"(_shufac3),   // %5
2486    "r"(_scaleac3)   // %6
2487  : "memory", "cc", "rax", "xmm6", "xmm7"
2488);
2489}
2490
2491static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
2492                                       uint8* dst_ptr, int dst_width) {
2493  asm volatile (
2494  "movdqa     (%4),%%xmm4                      \n"
2495  "movdqa     (%5),%%xmm5                      \n"
2496  "movdqa     (%6),%%xmm6                      \n"
2497  "movdqa     (%7),%%xmm7                      \n"
2498"1:"
2499  "movdqa     (%0),%%xmm2                      \n"
2500  "pavgb      (%0,%3,1),%%xmm2                 \n"
2501  "lea        0x10(%0),%0                      \n"
2502  "movdqa     %%xmm2,%%xmm0                    \n"
2503  "pshufb     %%xmm4,%%xmm0                    \n"
2504  "movdqa     %%xmm2,%%xmm1                    \n"
2505  "pshufb     %%xmm5,%%xmm1                    \n"
2506  "paddusw    %%xmm1,%%xmm0                    \n"
2507  "pshufb     %%xmm6,%%xmm2                    \n"
2508  "paddusw    %%xmm2,%%xmm0                    \n"
2509  "pmulhuw    %%xmm7,%%xmm0                    \n"
2510  "packuswb   %%xmm0,%%xmm0                    \n"
2511  "movd       %%xmm0,(%1)                      \n"
2512  "pextrw     $0x2,%%xmm0,%%eax                \n"
2513  "mov        %%ax,0x4(%1)                     \n"
2514  "lea        0x6(%1),%1                       \n"
2515  "sub        $0x6,%2                          \n"
2516  "ja         1b                               \n"
2517  : "+r"(src_ptr),     // %0
2518    "+r"(dst_ptr),     // %1
2519    "+r"(dst_width)    // %2
2520  : "r"((intptr_t)(src_stride)),  // %3
2521    "r"(_shufab0),   // %4
2522    "r"(_shufab1),   // %5
2523    "r"(_shufab2),   // %6
2524    "r"(_scaleab2)   // %7
2525  : "memory", "cc", "rax", "xmm6", "xmm7"
2526);
2527}
2528
2529#define HAS_SCALEADDROWS_SSE2
2530static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
2531                              uint16* dst_ptr, int src_width,
2532                              int src_height) {
2533  asm volatile (
2534  "pxor       %%xmm5,%%xmm5                    \n"
2535"1:"
2536  "movdqa     (%0),%%xmm2                      \n"
2537  "lea        (%0,%4,1),%%r10                  \n"
2538  "movhlps    %%xmm2,%%xmm3                    \n"
2539  "lea        -0x1(%3),%%r11                   \n"
2540  "punpcklbw  %%xmm5,%%xmm2                    \n"
2541  "punpcklbw  %%xmm5,%%xmm3                    \n"
2542
2543"2:"
2544  "movdqa     (%%r10),%%xmm0                   \n"
2545  "lea        (%%r10,%4,1),%%r10               \n"
2546  "movhlps    %%xmm0,%%xmm1                    \n"
2547  "punpcklbw  %%xmm5,%%xmm0                    \n"
2548  "punpcklbw  %%xmm5,%%xmm1                    \n"
2549  "paddusw    %%xmm0,%%xmm2                    \n"
2550  "paddusw    %%xmm1,%%xmm3                    \n"
2551  "sub        $0x1,%%r11                       \n"
2552  "ja         2b                               \n"
2553
2554  "movdqa     %%xmm2,(%1)                      \n"
2555  "movdqa     %%xmm3,0x10(%1)                  \n"
2556  "lea        0x20(%1),%1                      \n"
2557  "lea        0x10(%0),%0                      \n"
2558  "sub        $0x10,%2                         \n"
2559  "ja         1b                               \n"
2560  : "+r"(src_ptr),     // %0
2561    "+r"(dst_ptr),     // %1
2562    "+r"(src_width),   // %2
2563    "+r"(src_height)   // %3
2564  : "r"((intptr_t)(src_stride))  // %4
2565  : "memory", "cc", "r10", "r11"
2566);
2567}
2568
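// The following is an illustrative, CPU-agnostic sketch (not part of the
// original file) of what ScaleAddRows_SSE2 computes: each 16-bit output is
// the sum of src_height source rows for one column. The assembly uses
// saturating adds (paddusw) and processes 16 columns per iteration; this
// sketch ignores saturation.
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, int src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x;
  for (x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    int sum = 0;
    int y;
    for (y = 0; y < src_height; ++y) {
      sum += s[0];       // accumulate one column across all rows
      s += src_stride;
    }
    dst_ptr[x] = (uint16)sum;
  }
}
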
2569// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
2570#define HAS_SCALEFILTERROWS_SSE2
2571static void ScaleFilterRows_SSE2(uint8* dst_ptr,
2572                                 const uint8* src_ptr, int src_stride,
2573                                 int dst_width, int source_y_fraction) {
2574  if (source_y_fraction == 0) {
2575    asm volatile (
2576    "1:"
2577      "movdqa     (%1),%%xmm0                  \n"
2578      "lea        0x10(%1),%1                  \n"
2579      "movdqa     %%xmm0,(%0)                  \n"
2580      "lea        0x10(%0),%0                  \n"
2581      "sub        $0x10,%2                     \n"
2582      "ja         1b                           \n"
2583      "mov        -0x1(%0),%%al                \n"
2584      "mov        %%al,(%0)                    \n"
2585      : "+r"(dst_ptr),     // %0
2586        "+r"(src_ptr),     // %1
2587        "+r"(dst_width)    // %2
2588      :
2589      : "memory", "cc", "rax"
2590    );
2591    return;
2592  } else if (source_y_fraction == 128) {
2593    asm volatile (
2594    "1:"
2595      "movdqa     (%1),%%xmm0                  \n"
2596      "movdqa     (%1,%3,1),%%xmm2             \n"
2597      "lea        0x10(%1),%1                  \n"
2598      "pavgb      %%xmm2,%%xmm0                \n"
2599      "movdqa     %%xmm0,(%0)                  \n"
2600      "lea        0x10(%0),%0                  \n"
2601      "sub        $0x10,%2                     \n"
2602      "ja         1b                           \n"
2603      "mov        -0x1(%0),%%al                \n"
2604      "mov        %%al,(%0)                    \n"
2605      : "+r"(dst_ptr),     // %0
2606        "+r"(src_ptr),     // %1
2607        "+r"(dst_width)    // %2
2608      : "r"((intptr_t)(src_stride))  // %3
2609      : "memory", "cc", "rax"
2610    );
2611    return;
2612  } else {
2613    asm volatile (
2614      "mov        %3,%%eax                     \n"
2615      "movd       %%eax,%%xmm6                 \n"
2616      "punpcklwd  %%xmm6,%%xmm6                \n"
2617      "pshufd     $0x0,%%xmm6,%%xmm6           \n"
2618      "neg        %%eax                        \n"
2619      "add        $0x100,%%eax                 \n"
2620      "movd       %%eax,%%xmm5                 \n"
2621      "punpcklwd  %%xmm5,%%xmm5                \n"
2622      "pshufd     $0x0,%%xmm5,%%xmm5           \n"
2623      "pxor       %%xmm7,%%xmm7                \n"
2624    "1:"
2625      "movdqa     (%1),%%xmm0                  \n"
2626      "movdqa     (%1,%4,1),%%xmm2             \n"
2627      "lea        0x10(%1),%1                  \n"
2628      "movdqa     %%xmm0,%%xmm1                \n"
2629      "movdqa     %%xmm2,%%xmm3                \n"
2630      "punpcklbw  %%xmm7,%%xmm0                \n"
2631      "punpcklbw  %%xmm7,%%xmm2                \n"
2632      "punpckhbw  %%xmm7,%%xmm1                \n"
2633      "punpckhbw  %%xmm7,%%xmm3                \n"
2634      "pmullw     %%xmm5,%%xmm0                \n"
2635      "pmullw     %%xmm5,%%xmm1                \n"
2636      "pmullw     %%xmm6,%%xmm2                \n"
2637      "pmullw     %%xmm6,%%xmm3                \n"
2638      "paddusw    %%xmm2,%%xmm0                \n"
2639      "paddusw    %%xmm3,%%xmm1                \n"
2640      "psrlw      $0x8,%%xmm0                  \n"
2641      "psrlw      $0x8,%%xmm1                  \n"
2642      "packuswb   %%xmm1,%%xmm0                \n"
2643      "movdqa     %%xmm0,(%0)                  \n"
2644      "lea        0x10(%0),%0                  \n"
2645      "sub        $0x10,%2                     \n"
2646      "ja         1b                           \n"
2647      "mov        -0x1(%0),%%al                \n"
2648      "mov        %%al,(%0)                    \n"
2649      : "+r"(dst_ptr),     // %0
2650        "+r"(src_ptr),     // %1
2651        "+r"(dst_width),   // %2
2652        "+r"(source_y_fraction)  // %3
2653      : "r"((intptr_t)(src_stride))  // %4
2654      : "memory", "cc", "rax", "xmm6", "xmm7"
2655    );
2656  }
2657  return;
2658}
2659
2660// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
2661#define HAS_SCALEFILTERROWS_SSSE3
2662static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
2663                                  const uint8* src_ptr, int src_stride,
2664                                  int dst_width, int source_y_fraction) {
2665  if (source_y_fraction == 0) {
2666    asm volatile (
2667    "1:"
2668      "movdqa     (%1),%%xmm0                  \n"
2669      "lea        0x10(%1),%1                  \n"
2670      "movdqa     %%xmm0,(%0)                  \n"
2671      "lea        0x10(%0),%0                  \n"
2672      "sub        $0x10,%2                     \n"
2673      "ja         1b                           \n"
2674      "mov        -0x1(%0),%%al                \n"
2675      "mov        %%al,(%0)                    \n"
2676      : "+r"(dst_ptr),     // %0
2677        "+r"(src_ptr),     // %1
2678        "+r"(dst_width)    // %2
2679      :
2680      : "memory", "cc", "rax"
2681    );
2682    return;
2683  } else if (source_y_fraction == 128) {
2684    asm volatile (
2685    "1:"
2686      "movdqa     (%1),%%xmm0                  \n"
2687      "movdqa     (%1,%3,1),%%xmm2             \n"
2688      "lea        0x10(%1),%1                  \n"
2689      "pavgb      %%xmm2,%%xmm0                \n"
2690      "movdqa     %%xmm0,(%0)                  \n"
2691      "lea        0x10(%0),%0                  \n"
2692      "sub        $0x10,%2                     \n"
2693      "ja         1b                           \n"
2694      "mov        -0x1(%0),%%al                \n"
2695      "mov        %%al,(%0)                    \n"
2696      : "+r"(dst_ptr),     // %0
2697        "+r"(src_ptr),     // %1
2698        "+r"(dst_width)    // %2
2699      : "r"((intptr_t)(src_stride))  // %3
2700      : "memory", "cc", "rax"
2701    );
2702    return;
2703  } else {
2704    asm volatile (
2705      "mov        %3,%%eax                     \n"
2706      "shr        %%eax                        \n"
2707      "mov        %%al,%%ah                    \n"
2708      "neg        %%al                         \n"
2709      "add        $0x80,%%al                   \n"
2710      "movd       %%eax,%%xmm5                 \n"
2711      "punpcklwd  %%xmm5,%%xmm5                \n"
2712      "pshufd     $0x0,%%xmm5,%%xmm5           \n"
2713    "1:"
2714      "movdqa     (%1),%%xmm0                  \n"
2715      "movdqa     (%1,%4,1),%%xmm2             \n"
2716      "lea        0x10(%1),%1                  \n"
2717      "movdqa     %%xmm0,%%xmm1                \n"
2718      "punpcklbw  %%xmm2,%%xmm0                \n"
2719      "punpckhbw  %%xmm2,%%xmm1                \n"
2720      "pmaddubsw  %%xmm5,%%xmm0                \n"
2721      "pmaddubsw  %%xmm5,%%xmm1                \n"
2722      "psrlw      $0x7,%%xmm0                  \n"
2723      "psrlw      $0x7,%%xmm1                  \n"
2724      "packuswb   %%xmm1,%%xmm0                \n"
2725      "movdqa     %%xmm0,(%0)                  \n"
2726      "lea        0x10(%0),%0                  \n"
2727      "sub        $0x10,%2                     \n"
2728      "ja         1b                           \n"
2729      "mov        -0x1(%0),%%al                \n"
2730      "mov        %%al,(%0)                    \n"
2731      : "+r"(dst_ptr),     // %0
2732        "+r"(src_ptr),     // %1
2733        "+r"(dst_width),   // %2
2734        "+r"(source_y_fraction)  // %3
2735      : "r"((intptr_t)(src_stride))  // %4
2736      : "memory", "cc", "rax"
2737    );
2738  }
2739  return;
2740}
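
// An illustrative, CPU-agnostic sketch (not part of the original file) of the
// bilinear blend the ScaleFilterRows_SSE2/SSSE3 routines perform:
//   dst[i] = (src[i] * (256 - f) + src[i + stride] * f) >> 8
// where f is source_y_fraction in [0, 256). Like the assembly, it duplicates
// the last output pixel, presumably so the horizontal filter can read one
// pixel past the end of the row.
static void ScaleFilterRows_C_Sketch(uint8* dst_ptr,
                                     const uint8* src_ptr, int src_stride,
                                     int dst_width, int source_y_fraction) {
  int f1 = source_y_fraction;        // weight of the second row
  int f0 = 256 - source_y_fraction;  // weight of the first row
  int x;
  if (source_y_fraction == 0) {      // fast path: copy the first row
    memcpy(dst_ptr, src_ptr, dst_width);
    dst_ptr[dst_width] = dst_ptr[dst_width - 1];
    return;
  }
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * f0 + src_ptr[x + src_stride] * f1) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // duplicate last pixel
}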
2741#endif
2742#endif
2743
2744// CPU-agnostic row functions
2745static void ScaleRowDown2_C(const uint8* src_ptr, int src_stride,
2746                            uint8* dst, int dst_width) {
2747  int x;
2748  for (x = 0; x < dst_width; ++x) {
2749    *dst++ = *src_ptr;
2750    src_ptr += 2;
2751  }
2752}
2753
2754static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
2755                               uint8* dst, int dst_width) {
2756  int x;
2757  for (x = 0; x < dst_width; ++x) {
2758    *dst++ = (src_ptr[0] + src_ptr[1] +
2759              src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
2760    src_ptr += 2;
2761  }
2762}
2763
2764static void ScaleRowDown4_C(const uint8* src_ptr, int src_stride,
2765                            uint8* dst, int dst_width) {
2766  int x;
2767  for (x = 0; x < dst_width; ++x) {
2768    *dst++ = *src_ptr;
2769    src_ptr += 4;
2770  }
2771}
2772
2773static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
2774                               uint8* dst, int dst_width) {
2775  int x;
2776  for (x = 0; x < dst_width; ++x) {
2777    *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
2778              src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2779              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
2780              src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
2781              src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
2782              src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
2783              src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
2784              8) >> 4;
2785    src_ptr += 4;
2786  }
2787}
2788
2789// 640 output pixels is enough to allow 5120 input pixels with a 1/8 scale down.
2790// Keeping the total buffer under 4096 bytes avoids a stack check, saving 4% cpu.
2791// The following 2 lines cause a compile error on Windows, so #defines are used instead.
2792//static const int kMaxOutputWidth = 640;
2793//static const int kMaxRow12 = 1280;         //kMaxOutputWidth * 2;
2794#define kMaxOutputWidth   640
2795#define kMaxRow12         1280
2796
2797static void ScaleRowDown8_C(const uint8* src_ptr, int src_stride,
2798                            uint8* dst, int dst_width) {
2799  int x;
2800  for (x = 0; x < dst_width; ++x) {
2801    *dst++ = *src_ptr;
2802    src_ptr += 8;
2803  }
2804}
2805
2806// Note: calling code checks that dst_width is at most kMaxOutputWidth and,
2807// if not, uses ScaleRowDown8_C instead.
2808static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
2809                               uint8* dst, int dst_width) {
2810  ALIGN16(uint8 src_row[kMaxRow12 * 2]);
2811  assert(dst_width <= kMaxOutputWidth);
2812  ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
2813  ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
2814                     src_row + kMaxOutputWidth,
2815                     dst_width * 2);
2816  ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
2817}
2818
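// Point samples 3/4: keeps pixels 0, 1 and 3 of every 4 source pixels.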
2819static void ScaleRowDown34_C(const uint8* src_ptr, int src_stride,
2820                             uint8* dst, int dst_width) {
2821  uint8* dend;
2822  assert((dst_width % 3 == 0) && (dst_width > 0));
2823  dend = dst + dst_width;
2824  do {
2825    dst[0] = src_ptr[0];
2826    dst[1] = src_ptr[1];
2827    dst[2] = src_ptr[3];
2828    dst += 3;
2829    src_ptr += 4;
2830  } while (dst < dend);
2831}
2832
2833// Filter rows 0 and 1 together, 3 : 1
2834static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
2835                                   uint8* d, int dst_width) {
2836  uint8* dend;
2837  const uint8* s;
2838  const uint8* t;
2839  assert((dst_width % 3 == 0) && (dst_width > 0));
2840  dend = d + dst_width;
2841  s = src_ptr;
2842  t = src_ptr + src_stride;
2843  do {
2844    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2845    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2846    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2847    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2848    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2849    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2850    d[0] = (a0 * 3 + b0 + 2) >> 2;
2851    d[1] = (a1 * 3 + b1 + 2) >> 2;
2852    d[2] = (a2 * 3 + b2 + 2) >> 2;
2853    d += 3;
2854    s += 4;
2855    t += 4;
2856  } while (d < dend);
2857}
2858
2859// Filter rows 1 and 2 together, 1 : 1
2860static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
2861                                   uint8* d, int dst_width) {
2862  uint8* dend;
2863  const uint8* s;
2864  const uint8* t;
2865  assert((dst_width % 3 == 0) && (dst_width > 0));
2866  dend = d + dst_width;
2867  s = src_ptr;
2868  t = src_ptr + src_stride;
2869  do {
2870    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2871    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2872    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2873    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2874    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2875    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2876    d[0] = (a0 + b0 + 1) >> 1;
2877    d[1] = (a1 + b1 + 1) >> 1;
2878    d[2] = (a2 + b2 + 1) >> 1;
2879    d += 3;
2880    s += 4;
2881    t += 4;
2882  } while (d < dend);
2883}
2884
2885#if defined(HAS_SCALEFILTERROWS_SSE2)
2886// Filter row to 3/4
2887static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
2888                                int dst_width) {
2889  uint8* dend;
2890  const uint8* s;
2891  assert((dst_width % 3 == 0) && (dst_width > 0));
2892  dend = dst_ptr + dst_width;
2893  s = src_ptr;
2894  do {
2895    dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2896    dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2897    dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2898    dst_ptr += 3;
2899    s += 4;
2900  } while (dst_ptr < dend);
2901}
2902#endif
2903
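// Bilinear horizontal filter: x and dx are 16.16 fixed point; each output
// pixel blends the two nearest source pixels by the fractional position.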
2904static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
2905                              int dst_width, int dx) {
2906  int x = 0;
2907  int j;
2908  for (j = 0; j < dst_width; ++j) {
2909    int xi = x >> 16;
2910    int xf1 = x & 0xffff;
2911    int xf0 = 65536 - xf1;
2912
2913    *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
2914    x += dx;
2915  }
2916}
2917
2918// Does not work on Windows.
2919//static const int kMaxInputWidth = 2560;
2920#define kMaxInputWidth    2560
2921#if defined(HAS_SCALEFILTERROWS_SSE2)
2922#define HAS_SCALEROWDOWN34_SSE2
2923// Filter rows 0 and 1 together, 3 : 1
2924static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
2925                                      uint8* dst_ptr, int dst_width) {
2926  ALIGN16(uint8 row[kMaxInputWidth]);
2927  assert((dst_width % 3 == 0) && (dst_width > 0));
2928  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
2929  ScaleFilterCols34_C(dst_ptr, row, dst_width);
2930}
2931
2932// Filter rows 1 and 2 together, 1 : 1
2933static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
2934                                      uint8* dst_ptr, int dst_width) {
2935  ALIGN16(uint8 row[kMaxInputWidth]);
2936  assert((dst_width % 3 == 0) && (dst_width > 0));
2937  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
2938  ScaleFilterCols34_C(dst_ptr, row, dst_width);
2939}
2940#endif
2941
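// Point samples 3/8: keeps pixels 0, 3 and 6 of every 8 source pixels.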
2942static void ScaleRowDown38_C(const uint8* src_ptr, int src_stride,
2943                             uint8* dst, int dst_width) {
2944  int x;
2945  assert(dst_width % 3 == 0);
2946  for (x = 0; x < dst_width; x += 3) {
2947    dst[0] = src_ptr[0];
2948    dst[1] = src_ptr[3];
2949    dst[2] = src_ptr[6];
2950    dst += 3;
2951    src_ptr += 8;
2952  }
2953}
2954
2955// 8x3 -> 3x1
2956static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
2957                                   uint8* dst_ptr, int dst_width) {
2958  int i;
2959  assert((dst_width % 3 == 0) && (dst_width > 0));
2960  for (i = 0; i < dst_width; i += 3) {
2961    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2962        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2963        src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
2964        src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
2965        (65536 / 9) >> 16;
2966    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2967        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
2968        src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
2969        src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
2970        (65536 / 9) >> 16;
2971    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2972        src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
2973        src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
2974        (65536 / 6) >> 16;
2975    src_ptr += 8;
2976    dst_ptr += 3;
2977  }
2978}
2979
2980// 8x2 -> 3x1
2981static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
2982                                   uint8* dst_ptr, int dst_width) {
2983  int i;
2984  assert((dst_width % 3 == 0) && (dst_width > 0));
2985  for (i = 0; i < dst_width; i += 3) {
2986    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2987        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
2988        src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
2989    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2990        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
2991        src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
2992    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2993        src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
2994        (65536 / 4) >> 16;
2995    src_ptr += 8;
2996    dst_ptr += 3;
2997  }
2998}
2999
3000// C version 8x2 -> 8x1
3001static void ScaleFilterRows_C(uint8* dst_ptr,
3002                              const uint8* src_ptr, int src_stride,
3003                              int dst_width, int source_y_fraction) {
3004  int y1_fraction;
3005  int y0_fraction;
3006  const uint8* src_ptr1;
3007  uint8* end;
3008  assert(dst_width > 0);
3009  y1_fraction = source_y_fraction;
3010  y0_fraction = 256 - y1_fraction;
3011  src_ptr1 = src_ptr + src_stride;
3012  end = dst_ptr + dst_width;
3013  do {
3014    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
3015    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
3016    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
3017    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
3018    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
3019    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
3020    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
3021    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
3022    src_ptr += 8;
3023    src_ptr1 += 8;
3024    dst_ptr += 8;
3025  } while (dst_ptr < end);
3026  dst_ptr[0] = dst_ptr[-1];
3027}
3028
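// Sums src_height rows of src_ptr into a row of 16 bit column totals;
// used by the box filter.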
3029void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
3030                    uint16* dst_ptr, int src_width, int src_height) {
3031  int x,y;
3032  assert(src_width > 0);
3033  assert(src_height > 0);
3034  for (x = 0; x < src_width; ++x) {
3035    const uint8* s = src_ptr + x;
3036    int sum = 0;
3037    for (y = 0; y < src_height; ++y) {
3038      sum += s[0];
3039      s += src_stride;
3040    }
3041    dst_ptr[x] = sum;
3042  }
3043}
3044
3045/**
3046 * Scale plane, 1/2
3047 *
3048 * This is an optimized version for scaling down a plane to 1/2 of
3049 * its original size.
3050 *
3051 */
3052static void ScalePlaneDown2(int src_width, int src_height,
3053                            int dst_width, int dst_height,
3054                            int src_stride, int dst_stride,
3055                            const uint8* src_ptr, uint8* dst_ptr,
3056                            FilterMode filtering) {
3057  void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
3058                        uint8* dst_ptr, int dst_width);
3059  assert(IS_ALIGNED(src_width, 2));
3060  assert(IS_ALIGNED(src_height, 2));
3061
3062#if defined(HAS_SCALEROWDOWN2_NEON)
3063  if (TestCpuFlag(kCpuHasNEON) &&
3064      IS_ALIGNED(dst_width, 16)) {
3065    ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
3066  } else
3067#endif
3068#if defined(HAS_SCALEROWDOWN2_SSE2)
3069  if (TestCpuFlag(kCpuHasSSE2) &&
3070      IS_ALIGNED(dst_width, 16) &&
3071      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
3072      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
3073    ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
3074  } else
3075#endif
3076  {
3077    ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
3078  }
3079
3080  {
3081    int y;
3082    for (y = 0; y < dst_height; ++y) {
3083      ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
3084      src_ptr += (src_stride << 1);
3085      dst_ptr += dst_stride;
3086    }
3087  }
3088}
3089
3090/**
3091 * Scale plane, 1/4
3092 *
3093 * This is an optimized version for scaling down a plane to 1/4 of
3094 * its original size.
3095 */
3096static void ScalePlaneDown4(int src_width, int src_height,
3097                            int dst_width, int dst_height,
3098                            int src_stride, int dst_stride,
3099                            const uint8* src_ptr, uint8* dst_ptr,
3100                            FilterMode filtering) {
3101  void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
3102                        uint8* dst_ptr, int dst_width);
3103  assert(IS_ALIGNED(src_width, 4));
3104  assert(IS_ALIGNED(src_height, 4));
3105
3106#if defined(HAS_SCALEROWDOWN4_NEON)
3107  if (TestCpuFlag(kCpuHasNEON) &&
3108      IS_ALIGNED(dst_width, 4)) {
3109    ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
3110  } else
3111#endif
3112#if defined(HAS_SCALEROWDOWN4_SSE2)
3113  if (TestCpuFlag(kCpuHasSSE2) &&
3114      IS_ALIGNED(dst_width, 8) &&
3115      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
3116      IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
3117    ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
3118  } else
3119#endif
3120  {
3121    ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
3122  }
3123
3124  {
3125    int y;
3126    for (y = 0; y < dst_height; ++y) {
3127      ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
3128      src_ptr += (src_stride << 2);
3129      dst_ptr += dst_stride;
3130    }
3131  }
3132}
3133
3134/**
3135 * Scale plane, 1/8
3136 *
3137 * This is an optimized version for scaling down a plane to 1/8
3138 * of its original size.
3139 *
3140 */
3141static void ScalePlaneDown8(int src_width, int src_height,
3142                            int dst_width, int dst_height,
3143                            int src_stride, int dst_stride,
3144                            const uint8* src_ptr, uint8* dst_ptr,
3145                            FilterMode filtering) {
3146  void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
3147                        uint8* dst_ptr, int dst_width);
3148  assert(IS_ALIGNED(src_width, 8));
3149  assert(IS_ALIGNED(src_height, 8));
3150
3151#if defined(HAS_SCALEROWDOWN8_SSE2)
3152  if (TestCpuFlag(kCpuHasSSE2) &&
3153      IS_ALIGNED(dst_width, 4) &&
3154      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
3155      IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
3156    ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
3157  } else
3158#endif
3159  {
3160    ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
3161        ScaleRowDown8Int_C : ScaleRowDown8_C;
3162  }
3163
3164  {
3165    int y;
3166    for (y = 0; y < dst_height; ++y) {
3167      ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
3168      src_ptr += (src_stride << 3);
3169      dst_ptr += dst_stride;
3170    }
3171  }
3172}
3173
3174/**
3175 * Scale plane down, 3/4
3176 *
3177 * Provided by Frank Barchard (fbarchard@google.com)
3178 *
3179 */
3180static void ScalePlaneDown34(int src_width, int src_height,
3181                             int dst_width, int dst_height,
3182                             int src_stride, int dst_stride,
3183                             const uint8* src_ptr, uint8* dst_ptr,
3184                             FilterMode filtering) {
3185  void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
3186                           uint8* dst_ptr, int dst_width);
3187  void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
3188                           uint8* dst_ptr, int dst_width);
3189  assert(dst_width % 3 == 0);
3190#if defined(HAS_SCALEROWDOWN34_NEON)
3191  if (TestCpuFlag(kCpuHasNEON) &&
3192      (dst_width % 24 == 0)) {
3193    if (!filtering) {
3194      ScaleRowDown34_0 = ScaleRowDown34_NEON;
3195      ScaleRowDown34_1 = ScaleRowDown34_NEON;
3196    } else {
3197      ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
3198      ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
3199    }
3200  } else
3201#endif
3202
3203#if defined(HAS_SCALEROWDOWN34_SSSE3)
3204  if (TestCpuFlag(kCpuHasSSSE3) &&
3205      (dst_width % 24 == 0) &&
3206      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
3207      IS_ALIGNED(dst_ptr, 8) && IS_ALIGNED(dst_stride, 8)) {
3208    if (!filtering) {
3209      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
3210      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
3211    } else {
3212      ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
3213      ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
3214    }
3215  } else
3216#endif
3217#if defined(HAS_SCALEROWDOWN34_SSE2)
3218  if (TestCpuFlag(kCpuHasSSE2) &&
3219      (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
3220      IS_ALIGNED(dst_stride, 8) &&
3221      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
3222      filtering) {
3223    ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
3224    ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
3225  } else
3226#endif
3227  {
3228    if (!filtering) {
3229      ScaleRowDown34_0 = ScaleRowDown34_C;
3230      ScaleRowDown34_1 = ScaleRowDown34_C;
3231    } else {
3232      ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
3233      ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
3234    }
3235  }
3236  {
3237    int src_row = 0;
3238    int y;
3239    for (y = 0; y < dst_height; ++y) {
3240      switch (src_row) {
3241        case 0:
3242          ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
3243          break;
3244
3245        case 1:
3246          ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
3247          break;
3248
3249        case 2:
3250          ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
3251                           dst_ptr, dst_width);
3252          break;
3253      }
3254      ++src_row;
3255      src_ptr += src_stride;
3256      dst_ptr += dst_stride;
3257      if (src_row >= 3) {
3258        src_ptr += src_stride;
3259        src_row = 0;
3260      }
3261    }
3262  }
3263}
3264
3265/**
3266 * Scale plane, 3/8
3267 *
3268 * This is an optimized version for scaling down a plane to 3/8
3269 * of its original size.
3270 *
3271 * Reduces 16x3 to 6x1
3272 */
3273static void ScalePlaneDown38(int src_width, int src_height,
3274                             int dst_width, int dst_height,
3275                             int src_stride, int dst_stride,
3276                             const uint8* src_ptr, uint8* dst_ptr,
3277                             FilterMode filtering) {
3278  void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
3279                           uint8* dst_ptr, int dst_width);
3280  void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
3281                           uint8* dst_ptr, int dst_width);
3282  assert(dst_width % 3 == 0);
3283#if defined(HAS_SCALEROWDOWN38_NEON)
3284  if (TestCpuFlag(kCpuHasNEON) &&
3285      (dst_width % 12 == 0)) {
3286    if (!filtering) {
3287      ScaleRowDown38_3 = ScaleRowDown38_NEON;
3288      ScaleRowDown38_2 = ScaleRowDown38_NEON;
3289    } else {
3290      ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
3291      ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
3292    }
3293  } else
3294#endif
3295
3296#if defined(HAS_SCALEROWDOWN38_SSSE3)
3297  if (TestCpuFlag(kCpuHasSSSE3) &&
3298      (dst_width % 24 == 0) && IS_ALIGNED(src_stride, 16) &&
3299      IS_ALIGNED(dst_stride, 8) &&
3300      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
3301    if (!filtering) {
3302      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
3303      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
3304    } else {
3305      ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
3306      ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
3307    }
3308  } else
3309#endif
3310  {
3311    if (!filtering) {
3312      ScaleRowDown38_3 = ScaleRowDown38_C;
3313      ScaleRowDown38_2 = ScaleRowDown38_C;
3314    } else {
3315      ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
3316      ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
3317    }
3318  }
3319  {
3320    int src_row = 0;
3321    int y;
3322    for (y = 0; y < dst_height; ++y) {
3323      switch (src_row) {
3324        case 0:
3325        case 1:
3326          ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
3327          src_ptr += src_stride * 3;
3328          ++src_row;
3329          break;
3330
3331        case 2:
3332          ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
3333          src_ptr += src_stride * 2;
3334          src_row = 0;
3335          break;
3336      }
3337      dst_ptr += dst_stride;
3338    }
3339  }
3340}
3341
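// Sums an iboxwidth x iboxheight box of source pixels.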
3342__inline static uint32 SumBox(int iboxwidth, int iboxheight,
3343                            int src_stride, const uint8* src_ptr) {
3344  int x, y;
3345  uint32 sum;
3346  assert(iboxwidth > 0);
3347  assert(iboxheight > 0);
3348  sum = 0u;
3349  for (y = 0; y < iboxheight; ++y) {
3350    for (x = 0; x < iboxwidth; ++x) {
3351      sum += src_ptr[x];
3352    }
3353    src_ptr += src_stride;
3354  }
3355  return sum;
3356}
3357
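// Box filters one output row: dx is a 16.16 fixed point step that sets the
// width of each box; each output pixel is the average of its box.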
3358static void ScalePlaneBoxRow(int dst_width, int boxheight,
3359                             int dx, int src_stride,
3360                             const uint8* src_ptr, uint8* dst_ptr) {
3361  int x = 0;
3362  int i;
3363  for (i = 0; i < dst_width; ++i) {
3364    int ix = x >> 16;
3365    int boxwidth;
3366    x += dx;
3367    boxwidth = (x >> 16) - ix;
3368    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
3369        (boxwidth * boxheight);
3370  }
3371}
3372
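// Sums iboxwidth consecutive 16 bit column totals.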
3373__inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
3374  uint32 sum;
3375  int x;
3376  assert(iboxwidth > 0);
3377  sum = 0u;
3378  for (x = 0; x < iboxwidth; ++x) {
3379    sum += src_ptr[x];
3380  }
3381  return sum;
3382}
3383
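// Averages column totals into output pixels when dx has a fractional part,
// so the box width varies between minboxwidth and minboxwidth + 1.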
3384static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
3385                            const uint16* src_ptr, uint8* dst_ptr) {
3386  int scaletbl[2];
3387  int minboxwidth = (dx >> 16);
3388  scaletbl[0] = 65536 / (minboxwidth * boxheight);
3389  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
3390  {
3391    int* scaleptr = scaletbl - minboxwidth;
3392    int x = 0;
3393    int i;
3394    for (i = 0; i < dst_width; ++i) {
3395      int ix = x >> 16;
3396      int boxwidth;
3397      x += dx;
3398      boxwidth = (x >> 16) - ix;
3399      *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
3400    }
3401  }
3402}
3403
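// Averages column totals into output pixels when dx is a whole number of
// source pixels, so every box has the same width.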
3404static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
3405                            const uint16* src_ptr, uint8* dst_ptr) {
3406  int boxwidth = (dx >> 16);
3407  int scaleval = 65536 / (boxwidth * boxheight);
3408  int x = 0;
3409  int i;
3410  for (i = 0; i < dst_width; ++i) {
3411    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
3412    x += boxwidth;
3413  }
3414}
3415
3416/**
3417 * Scale plane down to any dimensions, with interpolation.
3418 * (boxfilter).
3419 *
3420 * Same method as SimpleScale, which is fixed point, outputting
3421 * one pixel of destination using fixed point (16.16) to step
3422 * through source, sampling a box of pixels with simple
3423 * averaging.
3424 */
3425static void ScalePlaneBox(int src_width, int src_height,
3426                          int dst_width, int dst_height,
3427                          int src_stride, int dst_stride,
3428                          const uint8* src_ptr, uint8* dst_ptr) {
3429  int dx, dy;
3430  assert(dst_width > 0);
3431  assert(dst_height > 0);
3432  dy = (src_height << 16) / dst_height;
3433  dx = (src_width << 16) / dst_width;
3434  if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
3435      dst_height * 2 > src_height) {
3436    uint8* dst = dst_ptr;
3437    int dy = (src_height << 16) / dst_height;
3438    int dx = (src_width << 16) / dst_width;
3439    int y = 0;
3440    int j;
3441    for (j = 0; j < dst_height; ++j) {
3442      int iy = y >> 16;
3443      const uint8* const src = src_ptr + iy * src_stride;
3444      int boxheight;
3445      y += dy;
3446      if (y > (src_height << 16)) {
3447        y = (src_height << 16);
3448      }
3449      boxheight = (y >> 16) - iy;
3450      ScalePlaneBoxRow(dst_width, boxheight,
3451                       dx, src_stride,
3452                       src, dst);
3453
3454      dst += dst_stride;
3455    }
3456  } else {
3457    ALIGN16(uint16 row[kMaxInputWidth]);
3458    void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
3459                         uint16* dst_ptr, int src_width, int src_height);
3460    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
3461                         const uint16* src_ptr, uint8* dst_ptr);
3462#if defined(HAS_SCALEADDROWS_SSE2)
3463    if (TestCpuFlag(kCpuHasSSE2) &&
3464        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
3465        IS_ALIGNED(src_width, 16)) {
3466      ScaleAddRows = ScaleAddRows_SSE2;
3467    } else
3468#endif
3469    {
3470      ScaleAddRows = ScaleAddRows_C;
3471    }
3472    if (dx & 0xffff) {
3473      ScaleAddCols = ScaleAddCols2_C;
3474    } else {
3475      ScaleAddCols = ScaleAddCols1_C;
3476    }
3477
3478    {
3479      int y = 0;
3480      int j;
3481      for (j = 0; j < dst_height; ++j) {
3482        int iy = y >> 16;
3483        const uint8* const src = src_ptr + iy * src_stride;
3484        int boxheight;
3485        y += dy;
3486        if (y > (src_height << 16)) {
3487          y = (src_height << 16);
3488        }
3489        boxheight = (y >> 16) - iy;
3490        ScaleAddRows(src, src_stride, row, src_width, boxheight);
3491        ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
3492        dst_ptr += dst_stride;
3493      }
3494    }
3495  }
3496}
3497
3498/**
3499 * Scale plane to/from any dimensions, with interpolation.
3500 */
3501static void ScalePlaneBilinearSimple(int src_width, int src_height,
3502                                     int dst_width, int dst_height,
3503                                     int src_stride, int dst_stride,
3504                                     const uint8* src_ptr, uint8* dst_ptr) {
3505  int i, j;
3506  uint8* dst = dst_ptr;
3507  int dx = (src_width << 16) / dst_width;
3508  int dy = (src_height << 16) / dst_height;
3509  int maxx = ((src_width - 1) << 16) - 1;
3510  int maxy = ((src_height - 1) << 16) - 1;
3511  int y = (dst_height < src_height) ? 32768 :
3512      (src_height << 16) / dst_height - 32768;
3513  for (i = 0; i < dst_height; ++i) {
3514    int cy = (y < 0) ? 0 : y;
3515    int yi = cy >> 16;
3516    int yf = cy & 0xffff;
3517    const uint8* const src = src_ptr + yi * src_stride;
3518    int x = (dst_width < src_width) ? 32768 :
3519        (src_width << 16) / dst_width - 32768;
3520    for (j = 0; j < dst_width; ++j) {
3521      int cx = (x < 0) ? 0 : x;
3522      int xi = cx >> 16;
3523      int xf = cx & 0xffff;
3524      int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
3525      int r1 = (src[xi + src_stride] * (65536 - xf) +
3526          src[xi + src_stride + 1] * xf) >> 16;
3527      *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
3528      x += dx;
3529      if (x > maxx)
3530        x = maxx;
3531    }
3532    dst += dst_stride - dst_width;
3533    y += dy;
3534    if (y > maxy)
3535      y = maxy;
3536  }
3537}
3538
3539/**
3540 * Scale plane to/from any dimensions, with bilinear
3541 * interpolation.
3542 */
3543static void ScalePlaneBilinear(int src_width, int src_height,
3544                               int dst_width, int dst_height,
3545                               int src_stride, int dst_stride,
3546                               const uint8* src_ptr, uint8* dst_ptr) {
3547  int dy;
3548  int dx;
3549  assert(dst_width > 0);
3550  assert(dst_height > 0);
3551  dy = (src_height << 16) / dst_height;
3552  dx = (src_width << 16) / dst_width;
3553  if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
3554    ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
3555                             src_stride, dst_stride, src_ptr, dst_ptr);
3556
3557  } else {
3558    ALIGN16(uint8 row[kMaxInputWidth + 1]);
3559    void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
3560                            int src_stride,
3561                            int dst_width, int source_y_fraction);
3562    void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
3563                            int dst_width, int dx);
3564#if defined(HAS_SCALEFILTERROWS_SSSE3)
3565    if (TestCpuFlag(kCpuHasSSSE3) &&
3566        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
3567        IS_ALIGNED(src_width, 16)) {
3568      ScaleFilterRows = ScaleFilterRows_SSSE3;
3569    } else
3570#endif
3571#if defined(HAS_SCALEFILTERROWS_SSE2)
3572    if (TestCpuFlag(kCpuHasSSE2) &&
3573        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16) &&
3574        IS_ALIGNED(src_width, 16)) {
3575      ScaleFilterRows = ScaleFilterRows_SSE2;
3576    } else
3577#endif
3578    {
3579      ScaleFilterRows = ScaleFilterRows_C;
3580    }
3581    ScaleFilterCols = ScaleFilterCols_C;
3582
3583    {
3584      int y = 0;
3585      int maxy = ((src_height - 1) << 16) - 1;  // max is filter of last 2 rows.
3586      int j;
3587      for (j = 0; j < dst_height; ++j) {
3588        int iy = y >> 16;
3589        int fy = (y >> 8) & 255;
3590        const uint8* const src = src_ptr + iy * src_stride;
3591        ScaleFilterRows(row, src, src_stride, src_width, fy);
3592        ScaleFilterCols(dst_ptr, row, dst_width, dx);
3593        dst_ptr += dst_stride;
3594        y += dy;
3595        if (y > maxy) {
3596          y = maxy;
3597        }
3598      }
3599    }
3600  }
3601}
3602
3603/**
3604 * Scale plane to/from any dimensions, without interpolation.
3605 * Fixed point math is used for performance: the upper 16 bits
3606 * of x and dx are the integer part of the source position and
3607 * the lower 16 bits are the fractional part.
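 * For example, scaling a 640 pixel row down to 480 pixels gives
 * dx = (640 << 16) / 480 = 0x15555, so x advances about 1.33 source
 * pixels per output pixel.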
3608 */
3609static void ScalePlaneSimple(int src_width, int src_height,
3610                             int dst_width, int dst_height,
3611                             int src_stride, int dst_stride,
3612                             const uint8* src_ptr, uint8* dst_ptr) {
3613  uint8* dst = dst_ptr;
3614  int dx = (src_width << 16) / dst_width;
3615  int y;
3616  for (y = 0; y < dst_height; ++y) {
3617    const uint8* const src = src_ptr + (y * src_height / dst_height) *
3618        src_stride;
3619    // TODO(fbarchard): Round X coordinate by setting x=0x8000.
3620    int x = 0;
3621    int i;
3622    for (i = 0; i < dst_width; ++i) {
3623      *dst++ = src[x >> 16];
3624      x += dx;
3625    }
3626    dst += dst_stride - dst_width;
3627  }
3628}
3629
3630/**
3631 * Scale plane to/from any dimensions.
3632 */
3633static void ScalePlaneAnySize(int src_width, int src_height,
3634                              int dst_width, int dst_height,
3635                              int src_stride, int dst_stride,
3636                              const uint8* src_ptr, uint8* dst_ptr,
3637                              FilterMode filtering) {
3638  if (!filtering) {
3639    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
3640                     src_stride, dst_stride, src_ptr, dst_ptr);
3641  } else {
3642    // fall back to non-optimized version
3643    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
3644                       src_stride, dst_stride, src_ptr, dst_ptr);
3645  }
3646}
3647
3648/**
3649 * Scale plane down, any size
3650 *
3651 * This is an optimized version for scaling down a plane to any size.
3652 * The current implementation is ~10 times faster compared to the
3653 * reference implementation for e.g. XGA->LowResPAL
3654 *
3655 */
3656static void ScalePlaneDown(int src_width, int src_height,
3657                           int dst_width, int dst_height,
3658                           int src_stride, int dst_stride,
3659                           const uint8* src_ptr, uint8* dst_ptr,
3660                           FilterMode filtering) {
3661  if (!filtering) {
3662    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
3663                     src_stride, dst_stride, src_ptr, dst_ptr);
3664  } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
3665    // between 1/2x and 1x use bilinear
3666    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
3667                       src_stride, dst_stride, src_ptr, dst_ptr);
3668  } else {
3669    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
3670                  src_stride, dst_stride, src_ptr, dst_ptr);
3671  }
3672}
3673
3674/**
3675 * Copy plane, no scaling
3676 *
3677 * This simply copies the given plane without scaling.
3678 * The current implementation is ~115 times faster
3679 * compared to the reference implementation.
3680 *
3681 */
3682static void CopyPlane(int src_width, int src_height,
3683                      int dst_width, int dst_height,
3684                      int src_stride, int dst_stride,
3685                      const uint8* src_ptr, uint8* dst_ptr) {
3686  if (src_stride == src_width && dst_stride == dst_width) {
3687    // All contiguous, so can use REALLY fast path.
3688    memcpy(dst_ptr, src_ptr, src_width * src_height);
3689  } else {
3690    // Not all contiguous; must copy scanlines individually
3691    const uint8* src = src_ptr;
3692    uint8* dst = dst_ptr;
3693    int i;
3694    for (i = 0; i < src_height; ++i) {
3695      memcpy(dst, src, src_width);
3696      dst += dst_stride;
3697      src += src_stride;
3698    }
3699  }
3700}
3701
3702static void ScalePlane(const uint8* src, int src_stride,
3703                       int src_width, int src_height,
3704                       uint8* dst, int dst_stride,
3705                       int dst_width, int dst_height,
3706                       FilterMode filtering, int use_ref) {
3707  // Use specialized scales to improve performance for common resolutions.
3708  // For example, all the 1/2 scalings will use ScalePlaneDown2()
3709  if (dst_width == src_width && dst_height == src_height) {
3710    // Straight copy.
3711    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
3712              dst_stride, src, dst);
3713  } else if (dst_width <= src_width && dst_height <= src_height) {
3714    // Scale down.
3715    if (use_ref) {
3716      // For testing, allow the optimized versions to be disabled.
3717      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
3718                     src_stride, dst_stride, src, dst, filtering);
3719    } else if (4 * dst_width == 3 * src_width &&
3720               4 * dst_height == 3 * src_height) {
3721      // optimized, 3/4
3722      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
3723                       src_stride, dst_stride, src, dst, filtering);
3724    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
3725      // optimized, 1/2
3726      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
3727                      src_stride, dst_stride, src, dst, filtering);
3728    // 3/8 rounded up for odd sized chroma height.
3729    } else if (8 * dst_width == 3 * src_width &&
3730               dst_height == ((src_height * 3 + 7) / 8)) {
3731      // optimized, 3/8
3732      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
3733                       src_stride, dst_stride, src, dst, filtering);
3734    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
3735      // optimized, 1/4
3736      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
3737                      src_stride, dst_stride, src, dst, filtering);
3738    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
3739      // optimized, 1/8
3740      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
3741                      src_stride, dst_stride, src, dst, filtering);
3742    } else {
3743      // Arbitrary downsample
3744      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
3745                     src_stride, dst_stride, src, dst, filtering);
3746    }
3747  } else {
3748    // Arbitrary scale up and/or down.
3749    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
3750                      src_stride, dst_stride, src, dst, filtering);
3751  }
3752}
3753
3754/**
3755 * Scale a plane.
3756 *
3757 * This function in turn calls a scaling function
3758 * suitable for handling the desired resolutions.
3759 *
3760 */
3761
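// Example (illustrative only; the buffers, strides and sizes are made up):
// downscaling a 640x360 I420 frame to 320x180 with bilinear filtering:
//   I420Scale(src_y, 640, src_u, 320, src_v, 320, 640, 360,
//             dst_y, 320, dst_u, 160, dst_v, 160, 320, 180,
//             kFilterBilinear);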
3762int I420Scale(const uint8* src_y, int src_stride_y,
3763              const uint8* src_u, int src_stride_u,
3764              const uint8* src_v, int src_stride_v,
3765              int src_width, int src_height,
3766              uint8* dst_y, int dst_stride_y,
3767              uint8* dst_u, int dst_stride_u,
3768              uint8* dst_v, int dst_stride_v,
3769              int dst_width, int dst_height,
3770              FilterMode filtering) {
3771  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
3772      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
3773    return -1;
3774  }
3775  // Negative height means invert the image.
3776  if (src_height < 0) {
3777    int halfheight;
3778    src_height = -src_height;
3779    halfheight = (src_height + 1) >> 1;
3780    src_y = src_y + (src_height - 1) * src_stride_y;
3781    src_u = src_u + (halfheight - 1) * src_stride_u;
3782    src_v = src_v + (halfheight - 1) * src_stride_v;
3783    src_stride_y = -src_stride_y;
3784    src_stride_u = -src_stride_u;
3785    src_stride_v = -src_stride_v;
3786  }
3787  {
3788    int src_halfwidth = (src_width + 1) >> 1;
3789    int src_halfheight = (src_height + 1) >> 1;
3790    int dst_halfwidth = (dst_width + 1) >> 1;
3791    int dst_halfheight = (dst_height + 1) >> 1;
3792
3793    ScalePlane(src_y, src_stride_y, src_width, src_height,
3794               dst_y, dst_stride_y, dst_width, dst_height,
3795               filtering, use_reference_impl_);
3796    ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
3797               dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
3798               filtering, use_reference_impl_);
3799    ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
3800               dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
3801               filtering, use_reference_impl_);
3802  }
3803  return 0;
3804}
3805
3806// Deprecated api
3807int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
3808          int src_stride_y, int src_stride_u, int src_stride_v,
3809          int src_width, int src_height,
3810          uint8* dst_y, uint8* dst_u, uint8* dst_v,
3811          int dst_stride_y, int dst_stride_u, int dst_stride_v,
3812          int dst_width, int dst_height,
3813          int interpolate) {
3814  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
3815      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
3816    return -1;
3817  }
3818  // Negative height means invert the image.
3819  if (src_height < 0) {
3820    int halfheight;
3821    src_height = -src_height;
3822    halfheight = (src_height + 1) >> 1;
3823    src_y = src_y + (src_height - 1) * src_stride_y;
3824    src_u = src_u + (halfheight - 1) * src_stride_u;
3825    src_v = src_v + (halfheight - 1) * src_stride_v;
3826    src_stride_y = -src_stride_y;
3827    src_stride_u = -src_stride_u;
3828    src_stride_v = -src_stride_v;
3829  }
3830  {
3831    int src_halfwidth = (src_width + 1) >> 1;
3832    int src_halfheight = (src_height + 1) >> 1;
3833    int dst_halfwidth = (dst_width + 1) >> 1;
3834    int dst_halfheight = (dst_height + 1) >> 1;
3835    FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
3836
3837    ScalePlane(src_y, src_stride_y, src_width, src_height,
3838               dst_y, dst_stride_y, dst_width, dst_height,
3839               filtering, use_reference_impl_);
3840    ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
3841               dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
3842               filtering, use_reference_impl_);
3843    ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
3844               dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
3845               filtering, use_reference_impl_);
3846  }
3847  return 0;
3848}
3849
3850// Deprecated api
3851int ScaleOffset(const uint8* src, int src_width, int src_height,
3852                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
3853                int interpolate) {
3854  if (!src || src_width <= 0 || src_height <= 0 ||
3855      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
3856      dst_yoffset >= dst_height) {
3857    return -1;
3858  }
3859  dst_yoffset = dst_yoffset & ~1;  // chroma requires the offset to be even.
3860  {
3861    int src_halfwidth = (src_width + 1) >> 1;
3862    int src_halfheight = (src_height + 1) >> 1;
3863    int dst_halfwidth = (dst_width + 1) >> 1;
3864    int dst_halfheight = (dst_height + 1) >> 1;
3865    int aheight = dst_height - dst_yoffset * 2;  // actual output height
3866    const uint8* const src_y = src;
3867    const uint8* const src_u = src + src_width * src_height;
3868    const uint8* const src_v = src + src_width * src_height +
3869                               src_halfwidth * src_halfheight;
3870    uint8* dst_y = dst + dst_yoffset * dst_width;
3871    uint8* dst_u = dst + dst_width * dst_height +
3872                   (dst_yoffset >> 1) * dst_halfwidth;
3873    uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
3874                   (dst_yoffset >> 1) * dst_halfwidth;
3875    return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
3876                 src_width, src_height, dst_y, dst_u, dst_v, dst_width,
3877                 dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
3878  }
3879}
3880
3881#ifdef __cplusplus
3882}  // extern "C"
3883}  // namespace libyuv
3884#endif
3885