/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/scale.h"

#include <assert.h>
#include <string.h>

#include "libyuv/cpu_id.h"

#if defined(_MSC_VER)
#define ALIGN16(var) __declspec(align(16)) var
#else
#define ALIGN16(var) var __attribute__((aligned(16)))
#endif

// Note: A Neon reference manual
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0204j/CJAJIIGG.html
// Note: Some SSE2 reference manuals
// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf

namespace libyuv {

// Set the following flag to true to revert to only
// using the reference implementation ScalePlaneBox(), and
// NOT the optimized versions. Useful for debugging and
// when comparing the quality of the resulting YUV planes
// as produced by the optimized and non-optimized versions.

static bool use_reference_impl_ = false;

void SetUseReferenceImpl(bool use) {
  use_reference_impl_ = use;
}

/**
 * NEON downscalers with interpolation.
 *
 * Provided by Fritz Koenig
 *
 */

#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
                        uint8* dst, int dst_width) {
  __asm__ volatile
  (
    "1:\n"
    "vld2.u8    {q0,q1}, [%0]!    \n"  // load even pixels into q0, odd into q1
    "vst1.u8    {q0}, [%1]!       \n"  // store even pixels
    "subs       %2, %2, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst),              // %1
      "+r"(dst_width)         // %2
    :
    : "q0", "q1"              // Clobber List
  );
}
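
// For reference, a scalar sketch of what ScaleRowDown2 computes: point
// sampling that keeps the even source pixels. (Illustrative helper only;
// nothing in the build dispatches to it.)
static inline void ScaleRowDown2Sketch(const uint8* src_ptr,
                                       uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2];  // even pixel kept, odd pixel discarded
  }
}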

void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                           uint8* dst, int dst_width) {
  __asm__ volatile
  (
    "mov        r4, #2            \n"  // rounding constant
    "add        %1, %0            \n"  // change the stride to row 2 pointer
    "vdup.16    q4, r4            \n"
    "1:\n"
    "vld1.u8    {q0,q1}, [%0]!    \n"  // load row 1 and post increment
    "vld1.u8    {q2,q3}, [%1]!    \n"  // load row 2 and post increment
    "vpaddl.u8  q0, q0            \n"  // row 1 add adjacent
    "vpaddl.u8  q1, q1            \n"
    "vpadal.u8  q0, q2            \n"  // row 2 add adjacent, add row 1 to row 2
    "vpadal.u8  q1, q3            \n"
    "vadd.u16   q0, q4            \n"  // rounding
    "vadd.u16   q1, q4            \n"
    "vshrn.u16  d0, q0, #2        \n"  // downshift and pack
    "vshrn.u16  d1, q1, #2        \n"
    "vst1.u8    {q0}, [%2]!       \n"
    "subs       %3, %3, #16       \n"  // 16 processed per loop
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(src_stride),       // %1
      "+r"(dst),              // %2
      "+r"(dst_width)         // %3
    :
    : "r4", "q0", "q1", "q2", "q3", "q4"              // Clobber List
  );
}
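
// Scalar sketch of the 2x2 box filter above, using the same rounding
// (add 2, then >> 2). Illustrative helper only.
static inline void ScaleRowDown2IntSketch(const uint8* src_ptr, int src_stride,
                                          uint8* dst, int dst_width) {
  const uint8* s = src_ptr;               // row 1
  const uint8* t = src_ptr + src_stride;  // row 2
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = static_cast<uint8>((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}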

#define HAS_SCALEROWDOWN4_NEON
// Widths on arm devices are expected to be smaller, so 8x4 blocks were
//  chosen for the best coverage.  Revisit later and evaluate 16x4 blocks
//  with handling of leftovers.
static void ScaleRowDown4_NEON(const uint8* src_ptr, int /* src_stride */,
                               uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "mov        r4, #4            \n"
    "1:                           \n"
    "vld1.u8    {d0[0]}, [%0],r4  \n"   // load up only 2 pixels of data to
    "vld1.u8    {d0[1]}, [%0],r4  \n"   //  represent the entire 8x4 block

    "vst1.u16   {d0[0]}, [%1]!    \n"

    "subs       %2, #2            \n"   // dst_width -= 2
    "bhi        1b                \n"
    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    :
    : "r4", "q0", "q1", "memory", "cc"
  );
}

static void ScaleRowDown4Int_NEON(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm__ volatile
  (
    "1:                           \n"
    "mov        r4, %0            \n"
    "vld1.u8    {d0}, [r4],%3     \n"   // load up 8x4 block of input data
    "vld1.u8    {d1}, [r4],%3     \n"
    "vld1.u8    {d2}, [r4],%3     \n"
    "vld1.u8    {d3}, [r4]        \n"

    // data is loaded up into q0 and q1
    // q0 = a00 a01 a02 a03 b00 b01 b02 b03 a10 a11 a12 a13 b10 b11 b12 b13
    // q1 = a20 a21 a22 a23 b20 b21 b22 b23 a30 a31 a32 a33 b30 b31 b32 b33
    // q0 = a00+a01 a02+a03 b00+b01 b02+b03 a10+a11 a12+a13 b10+b11 b12+b13
    "vpaddl.u8  q0, q0            \n"

    // d0 = a00+a01+a20+a21 a02+a03+a22+a23 b00+b01+b20+b21 b02+b03+b22+b23
    // d1 = a10+a11+a30+a31 a12+a13+a32+a33 b10+b11+b30+b31 b12+b13+b32+b33
    "vpadal.u8  q0, q1            \n"

    // d0 = a00+a01+a20+a21+a02+a03+a22+a23 b00+b01+b20+b21+b02+b03+b22+b23
    // d1 = a10+a11+a30+a31+a12+a13+a32+a33 b10+b11+b30+b31+b12+b13+b32+b33
    "vpaddl.u16 q0, q0            \n"

    // d0 = a00+a01+a20+a21+a02+a03+a22+a23+a10+a11+a30+a31+a12+a13+a32+a33
    //      b00+b01+b20+b21+b02+b03+b22+b23+b10+b11+b30+b31+b12+b13+b32+b33
    "vadd.u32   d0, d1            \n"

    "vrshr.u32  d0, d0, #4        \n"   // divide by 16 w/rounding

    "vst1.u8    {d0[0]}, [%1]!    \n"
    "vst1.u8    {d0[4]}, [%1]!    \n"

    "add        %0, #8            \n"   // move src pointer to next 8 pixels
    "subs       %2, #2            \n"   // dst_width -= 2
    "bhi        1b                \n"

    : "+r"(src_ptr),          // %0
      "+r"(dst_ptr),          // %1
      "+r"(dst_width)         // %2
    : "r"(src_stride)         // %3
    : "r4", "q0", "q1", "memory", "cc"
  );
}
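
// Equivalent scalar math for one output of the kernel above: the rounded
// mean of a 4x4 block (sum + 8, then >> 4, matching vrshr.u32 #4).
// Illustrative sketch only.
static inline uint8 Average4x4Sketch(const uint8* p, int stride) {
  int sum = 0;
  for (int y = 0; y < 4; ++y) {
    for (int x = 0; x < 4; ++x) {
      sum += p[y * stride + x];
    }
  }
  return static_cast<uint8>((sum + 8) >> 4);
}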

/**
 * SSE2 downscalers with interpolation.
 *
 * Provided by Frank Barchard (fbarchard@google.com)
 *
 */

// Constants for SSE2 code
#elif (defined(WIN32) || defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !TARGET_IPHONE_SIMULATOR
#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(OSX)
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
#endif

// Offsets for source bytes 0 to 9
extern "C" TALIGN16(const uint8, shuf0[16]) =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
extern "C" TALIGN16(const uint8, shuf1[16]) =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf2[16]) =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
extern "C" TALIGN16(const uint8, shuf01[16]) =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
extern "C" TALIGN16(const uint8, shuf11[16]) =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
extern "C" TALIGN16(const uint8, shuf21[16]) =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
extern "C" TALIGN16(const uint8, madd01[16]) =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
extern "C" TALIGN16(const uint8, madd11[16]) =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
extern "C" TALIGN16(const uint8, madd21[16]) =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >> 2 in the 3/4 kernels
extern "C" TALIGN16(const int16, round34[8]) =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

extern "C" TALIGN16(const uint8, shuf38a[16]) =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

extern "C" TALIGN16(const uint8, shuf38b[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
extern "C" TALIGN16(const uint8, shufac0[16]) =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
extern "C" TALIGN16(const uint8, shufac3[16]) =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
extern "C" TALIGN16(const uint16, scaleac3[8]) =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab0[16]) =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab1[16]) =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
extern "C" TALIGN16(const uint8, shufab2[16]) =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
extern "C" TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
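
// Two conventions used by the tables above, shown as scalar sketches
// (illustrative helpers only, not called by the scaler): pshufb treats any
// index with the high bit set (the 128 entries) as "write zero", and pmulhuw
// by 65536 / N approximates an unsigned divide by N (floor; can be one low).
static inline uint8 ShufByteSketch(const uint8* src16, uint8 index) {
  return (index & 0x80) ? 0 : src16[index & 15];  // per-byte pshufb semantics
}
static inline uint16 DivideBy9Sketch(uint16 x) {
  return static_cast<uint16>((x * (65536 / 9)) >> 16);  // pmulhuw by scaleac3
}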
#endif

#if defined(WIN32) && !defined(COVERAGE_ENABLED)

#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    ja         wloop

    pop        esi
    ret
  }
}

#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x000000ff
    psrld      xmm7, 24

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8
    lea        edx, [ebx + ebx * 2]  // src_stride * 3

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 8
    ja         wloop

    popad
    ret
  }
}

#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    pcmpeqb    xmm7, xmm7            // generate mask isolating 1 byte per 8
    psrlq      xmm7, 56

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm1, [esi + 16]
    lea        esi,  [esi + 32]
    pand       xmm0, xmm7
    pand       xmm1, xmm7
    packuswb   xmm0, xmm1  // 32->16
    packuswb   xmm0, xmm0  // 16->8
    packuswb   xmm0, xmm0  // 8->4
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}

// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    lea        edx, [ebx + ebx * 2]  // src_stride * 3
    pxor       xmm7, xmm7

  wloop:
    movdqa     xmm0, [esi]           // average 8 rows to 1
    movdqa     xmm1, [esi + 16]
    movdqa     xmm2, [esi + ebx]
    movdqa     xmm3, [esi + ebx + 16]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqa     xmm2, [esi + ebx * 2]
    movdqa     xmm3, [esi + ebx * 2 + 16]
    movdqa     xmm4, [esi + edx]
    movdqa     xmm5, [esi + edx + 16]
    lea        ebp, [esi + ebx * 4]
    lea        esi, [esi + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, [ebp]
    movdqa     xmm3, [ebp + 16]
    movdqa     xmm4, [ebp + ebx]
    movdqa     xmm5, [ebp + ebx + 16]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    movdqa     xmm4, [ebp + ebx * 2]
    movdqa     xmm5, [ebp + ebx * 2 + 16]
    movdqa     xmm6, [ebp + edx]
    pavgb      xmm4, xmm6
    movdqa     xmm6, [ebp + edx + 16]
    pavgb      xmm5, xmm6
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    psadbw     xmm0, xmm7            // average 32 pixels to 4
    psadbw     xmm1, xmm7
    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
    por        xmm0, xmm1            //      -> 3201
    psrlw      xmm0, 3
    packuswb   xmm0, xmm0
    packuswb   xmm0, xmm0
    movd       dword ptr [edi], xmm0

    lea        edi, [edi + 4]
    sub        ecx, 4
    ja         wloop

    popad
    ret
  }
}
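
// The psadbw-against-zero trick above sums each group of 8 absolute byte
// values into one word; that is how 32 row-averaged pixels collapse to 4
// outputs. Scalar sketch of that final step (sum of 8, then >> 3):
static inline uint8 AverageOf8Sketch(const uint8* p) {
  int sum = 0;
  for (int i = 0; i < 8; ++i) {
    sum += p[i];  // psadbw accumulates |p[i] - 0| the same way
  }
  return static_cast<uint8>(sum >> 3);  // matches the psrlw xmm0, 3
}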

#define HAS_SCALEROWDOWN34_SSSE3
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
                                     // src_stride ignored
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm3, _shuf0
    movdqa     xmm4, _shuf1
    movdqa     xmm5, _shuf2

  wloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + 16]
    lea        esi,  [esi + 32]
    movdqa     xmm1, xmm2
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edi], xmm0
    movq       qword ptr [edi + 8], xmm1
    movq       qword ptr [edi + 16], xmm2
    lea        edi, [edi + 24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values.  For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 round34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}

// Blends 32x2 rectangle to 24x1, biased toward row 0: the double pavgb
// weights the two rows 3:1.
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        ebx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _round34

  wloop:
    movdqa     xmm0, [esi]           // pixels 0..7
    movdqa     xmm1, [esi+ebx]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi], xmm0
    movdqu     xmm0, [esi+8]         // pixels 8..15
    movdqu     xmm1, [esi+ebx+8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+8], xmm0
    movdqa     xmm0, [esi+16]        // pixels 16..23
    movdqa     xmm1, [esi+ebx+16]
    lea        esi, [esi+32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, _madd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edi+16], xmm0
    lea        edi, [edi+24]
    sub        ecx, 24
    ja         wloop

    popad
    ret
  }
}

#define HAS_SCALEROWDOWN38_SSSE3
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm5, _shuf38a
    movdqa     xmm6, _shuf38b
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]           // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [esi + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        esi, [esi + 32]
    pshufb     xmm0, xmm5
    pshufb     xmm1, xmm6
    paddusb    xmm0, xmm1

    movq       qword ptr [edi], xmm0 // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edi + 8], xmm1
    lea        edi, [edi + 12]
    sub        ecx, 12
    ja         xloop

    popad
    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm4, _shufac0
    movdqa     xmm5, _shufac3
    movdqa     xmm6, _scaleac3
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]           // sum up 3 rows into xmm0/1
    movdqa     xmm2, [esi + edx]
    movhlps    xmm1, xmm0
    movhlps    xmm3, xmm2
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm1, xmm7
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3
    movdqa     xmm2, [esi + edx * 2]
    lea        esi, [esi + 16]
    movhlps    xmm3, xmm2
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7
    paddusw    xmm0, xmm2
    paddusw    xmm1, xmm3

    movdqa     xmm2, xmm0            // 8 pixels -> 0,1,2 of xmm2
    psrldq     xmm0, 2
    paddusw    xmm2, xmm0
    psrldq     xmm0, 2
    paddusw    xmm2, xmm0
    pshufb     xmm2, xmm4

    movdqa     xmm3, xmm1            // 8 pixels -> 3,4,5 of xmm2
    psrldq     xmm1, 2
    paddusw    xmm3, xmm1
    psrldq     xmm1, 2
    paddusw    xmm3, xmm1
    pshufb     xmm3, xmm5
    paddusw    xmm2, xmm3

    pmulhuw    xmm2, xmm6            // divide by 9,9,6, 9,9,6
    packuswb   xmm2, xmm2

    movd       [edi], xmm2           // write 6 pixels
    pextrw     eax, xmm2, 2
    mov        [edi + 4], ax
    lea        edi, [edi + 6]
    sub        ecx, 6
    ja         xloop

    popad
    ret
  }
}
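
// Scalar equivalent of one output pixel of the 16x3 -> 6x1 kernel above: sum
// a 3x3 box (2x3 for the trailing column) and scale by the matching
// 65536 / 9 or 65536 / 6 entry of scaleac3 via pmulhuw. Illustrative sketch:
static inline uint8 Box3x3Sketch(const uint8* p, int stride) {
  int sum = 0;
  for (int y = 0; y < 3; ++y) {
    for (int x = 0; x < 3; ++x) {
      sum += p[y * stride + x];  // paddusw accumulation in the SSE code
    }
  }
  return static_cast<uint8>((sum * (65536 / 9)) >> 16);  // approximate /9
}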

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // dst_width
    movdqa     xmm4, _shufab0
    movdqa     xmm5, _shufab1
    movdqa     xmm6, _shufab2
    movdqa     xmm7, _scaleab2

  xloop:
    movdqa     xmm2, [esi]           // average 2 rows into xmm2
    pavgb      xmm2, [esi + edx]
    lea        esi, [esi + 16]

    movdqa     xmm0, xmm2            // 16 pixels -> 0,1,2,3,4,5 of xmm0
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmm2
    pshufb     xmm1, xmm5
    paddusw    xmm0, xmm1
    pshufb     xmm2, xmm6
    paddusw    xmm0, xmm2

    pmulhuw    xmm0, xmm7            // divide by 3,3,2, 3,3,2
    packuswb   xmm0, xmm0

    movd       [edi], xmm0           // write 6 pixels
    pextrw     eax, xmm0, 2
    mov        [edi + 4], ax
    lea        edi, [edi + 6]
    sub        ecx, 6
    ja         xloop

    popad
    ret
  }
}

#define HAS_SCALEADDROWS_SSE2

// Reads 16xN bytes and produces 16 shorts at a time.
__declspec(naked)
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  __asm {
    pushad
    mov        esi, [esp + 32 + 4]   // src_ptr
    mov        edx, [esp + 32 + 8]   // src_stride
    mov        edi, [esp + 32 + 12]  // dst_ptr
    mov        ecx, [esp + 32 + 16]  // src_width
    mov        ebx, [esp + 32 + 20]  // src_height
    pxor       xmm7, xmm7
    dec        ebx

  xloop:
    // first row
    movdqa     xmm2, [esi]
    lea        eax, [esi + edx]
    movhlps    xmm3, xmm2
    mov        ebp, ebx
    punpcklbw  xmm2, xmm7
    punpcklbw  xmm3, xmm7

    // sum remaining rows
  yloop:
    movdqa     xmm0, [eax]       // read 16 pixels
    lea        eax, [eax + edx]  // advance to next row
    movhlps    xmm1, xmm0
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm1, xmm7
    paddusw    xmm2, xmm0        // sum 16 words
    paddusw    xmm3, xmm1
    sub        ebp, 1
    ja         yloop

    movdqa     [edi], xmm2
    movdqa     [edi + 16], xmm3
    lea        edi, [edi + 32]
    lea        esi, [esi + 16]

    sub        ecx, 16
    ja         xloop

    popad
    ret
  }
}
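
// C sketch of what ScaleAddRows computes: each output word is the column sum
// of src_height source rows. (The SSE2 code uses saturating paddusw; this
// illustrative helper assumes the sums stay below 65536.)
static inline void ScaleAddRowsSketch(const uint8* src_ptr, int src_stride,
                                      uint16* dst_ptr, int src_width,
                                      int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    const uint8* s = src_ptr + x;
    for (int y = 0; y < src_height; ++y) {
      sum = static_cast<uint16>(sum + *s);
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}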

// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2
__declspec(naked)
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                 int src_stride, int dst_width,
                                 int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    cmp        eax, 0
    je         xloop1
    cmp        eax, 128
    je         xloop2

    movd       xmm6, eax            // xmm6 = y fraction
    punpcklwd  xmm6, xmm6
    pshufd     xmm6, xmm6, 0
    neg        eax                  // xmm5 = 256 - y fraction
    add        eax, 256
    movd       xmm5, eax
    punpcklwd  xmm5, xmm5
    pshufd     xmm5, xmm5, 0
    pxor       xmm7, xmm7

  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    punpcklbw  xmm0, xmm7
    punpcklbw  xmm2, xmm7
    punpckhbw  xmm1, xmm7
    punpckhbw  xmm3, xmm7
    pmullw     xmm0, xmm5           // scale row 0
    pmullw     xmm1, xmm5
    pmullw     xmm2, xmm6           // scale row 1
    pmullw     xmm3, xmm6
    paddusw    xmm0, xmm2           // sum rows
    paddusw    xmm1, xmm3
    psrlw      xmm0, 8
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop1:
    movdqa     xmm0, [esi]
    lea        esi, [esi + 16]
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop1

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop2:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    pavgb      xmm0, xmm2
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop2

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret
  }
}
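
// Scalar math for the general path above, as a sketch (the f == 0 copy and
// f == 128 pavgb special cases are handled separately): each output pixel is
// dst = (a * (256 - f) + b * f) >> 8 with f = source_y_fraction.
static inline uint8 BlendRowsSketch(uint8 a, uint8 b, int source_y_fraction) {
  return static_cast<uint8>(
      (a * (256 - source_y_fraction) + b * source_y_fraction) >> 8);
}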

// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
__declspec(naked)
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                  int src_stride, int dst_width,
                                  int source_y_fraction) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]   // dst_ptr
    mov        esi, [esp + 8 + 8]   // src_ptr
    mov        edx, [esp + 8 + 12]  // src_stride
    mov        ecx, [esp + 8 + 16]  // dst_width
    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    cmp        eax, 0
    je         xloop1
    cmp        eax, 128
    je         xloop2

    shr        eax, 1
    mov        ah, al
    neg        al
    add        al, 128
    movd       xmm7, eax
    punpcklwd  xmm7, xmm7
    pshufd     xmm7, xmm7, 0

  xloop:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    pmaddubsw  xmm0, xmm7
    pmaddubsw  xmm1, xmm7
    psrlw      xmm0, 7
    psrlw      xmm1, 7
    packuswb   xmm0, xmm1
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop1:
    movdqa     xmm0, [esi]
    lea        esi, [esi + 16]
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop1

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret

  xloop2:
    movdqa     xmm0, [esi]
    movdqa     xmm2, [esi + edx]
    lea        esi, [esi + 16]
    pavgb      xmm0, xmm2
    movdqa     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 16
    ja         xloop2

    mov        al, [edi - 1]
    mov        [edi], al
    pop        edi
    pop        esi
    ret
  }
}
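
// Worked note on the SSSE3 weight packing above: the fraction is halved to
// 7 bits and eax becomes ((f / 2) << 8) | (128 - f / 2), so pmaddubsw on the
// interleaved rows computes a * (128 - f / 2) + b * (f / 2) per word lane,
// rescaled by psrlw 7. Example, f = 64: coefficients 96 and 32, i.e. a
// 3/4 + 1/4 blend.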

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    int dst_width) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width
    movdqa     xmm1, _round34
    movdqa     xmm2, _shuf01
    movdqa     xmm3, _shuf11
    movdqa     xmm4, _shuf21
    movdqa     xmm5, _madd01
    movdqa     xmm6, _madd11
    movdqa     xmm7, _madd21

  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax+8]         // pixels 8..15
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx+8], xmm0
    movdqa     xmm0, [eax+16]        // pixels 16..23
    lea        eax, [eax+32]
    pshufb     xmm0, xmm4
    pmaddubsw  xmm0, xmm7
    paddsw     xmm0, xmm1
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx+16], xmm0
    lea        edx, [edx+24]
    sub        ecx, 24
    ja         wloop
    ret
  }
}

#elif (defined(__x86_64__) || defined(__i386__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt
#define HAS_SCALEROWDOWN2_SSE2
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%3,1),%%xmm2\n"
  "movdqa     0x10(%0,%3,1),%%xmm3\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "psrlw      $0x8,%%xmm1\n"
  "pand       %%xmm7,%%xmm2\n"
  "pand       %%xmm7,%%xmm3\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "pavgw      %%xmm3,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"(static_cast<intptr_t>(src_stride))   // %3
  : "memory"
);
}

#define HAS_SCALEROWDOWN4_SSE2
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrld      $0x18,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  intptr_t temp = 0;
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlw      $0x8,%%xmm7\n"
  "lea        (%4,%4,2),%3\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%4,1),%%xmm2\n"
  "movdqa     0x10(%0,%4,1),%%xmm3\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     (%0,%4,2),%%xmm2\n"
  "movdqa     0x10(%0,%4,2),%%xmm3\n"
  "movdqa     (%0,%3,1),%%xmm4\n"
  "movdqa     0x10(%0,%3,1),%%xmm5\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm4,%%xmm2\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm5,%%xmm3\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "psrlw      $0x8,%%xmm1\n"
  "pand       %%xmm7,%%xmm2\n"
  "pand       %%xmm7,%%xmm3\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "pavgw      %%xmm3,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrlw      $0x8,%%xmm0\n"
  "pand       %%xmm7,%%xmm2\n"
  "pavgw      %%xmm2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "+r"(temp)         // %3
  : "r"(static_cast<intptr_t>(src_stride))    // %4
  : "memory"
);
}

#define HAS_SCALEROWDOWN8_SSE2
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile(
  "pcmpeqb    %%xmm7,%%xmm7\n"
  "psrlq      $0x38,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pand       %%xmm7,%%xmm0\n"
  "pand       %%xmm7,%%xmm1\n"
  "packuswb   %%xmm1,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movd       %%xmm0,(%1)\n"
  "lea        0x4(%1),%1\n"
  "sub        $0x4,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :
  : "memory"
);
}

#if defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown8Int_SSE2\n"
"_ScaleRowDown8Int_SSE2:\n"
#else
    ".global ScaleRowDown8Int_SSE2\n"
"ScaleRowDown8Int_SSE2:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%ebx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "lea    (%ebx,%ebx,2),%edx\n"
    "pxor   %xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa 0x10(%esi),%xmm1\n"
    "movdqa (%esi,%ebx,1),%xmm2\n"
    "movdqa 0x10(%esi,%ebx,1),%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "movdqa (%esi,%ebx,2),%xmm2\n"
    "movdqa 0x10(%esi,%ebx,2),%xmm3\n"
    "movdqa (%esi,%edx,1),%xmm4\n"
    "movdqa 0x10(%esi,%edx,1),%xmm5\n"
    "lea    (%esi,%ebx,4),%ebp\n"
    "lea    0x20(%esi),%esi\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "movdqa 0x0(%ebp),%xmm2\n"
    "movdqa 0x10(%ebp),%xmm3\n"
    "movdqa 0x0(%ebp,%ebx,1),%xmm4\n"
    "movdqa 0x10(%ebp,%ebx,1),%xmm5\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "movdqa 0x0(%ebp,%ebx,2),%xmm4\n"
    "movdqa 0x10(%ebp,%ebx,2),%xmm5\n"
    "movdqa 0x0(%ebp,%edx,1),%xmm6\n"
    "pavgb  %xmm6,%xmm4\n"
    "movdqa 0x10(%ebp,%edx,1),%xmm6\n"
    "pavgb  %xmm6,%xmm5\n"
    "pavgb  %xmm4,%xmm2\n"
    "pavgb  %xmm5,%xmm3\n"
    "pavgb  %xmm2,%xmm0\n"
    "pavgb  %xmm3,%xmm1\n"
    "psadbw %xmm7,%xmm0\n"
    "psadbw %xmm7,%xmm1\n"
    "pshufd $0xd8,%xmm0,%xmm0\n"
    "pshufd $0x8d,%xmm1,%xmm1\n"
    "por    %xmm1,%xmm0\n"
    "psrlw  $0x3,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movd   %xmm0,(%edi)\n"
    "lea    0x4(%edi),%edi\n"
    "sub    $0x4,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);
// -fPIC is used for the magiccam plugin; these non-PIC functions are
// excluded when __PIC__ is defined.
#if !defined(__PIC__)
#define HAS_SCALEROWDOWN34_SSSE3
extern "C" void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                     uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown34_SSSE3\n"
"_ScaleRowDown34_SSSE3:\n"
#else
    ".global ScaleRowDown34_SSSE3\n"
"ScaleRowDown34_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shuf0,%xmm3\n"
    "movdqa _shuf1,%xmm4\n"
    "movdqa _shuf2,%xmm5\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa 0x10(%esi),%xmm2\n"
    "lea    0x20(%esi),%esi\n"
    "movdqa %xmm2,%xmm1\n"
    "palignr $0x8,%xmm0,%xmm1\n"
    "pshufb %xmm3,%xmm0\n"
    "pshufb %xmm4,%xmm1\n"
    "pshufb %xmm5,%xmm2\n"
    "movq   %xmm0,(%edi)\n"
    "movq   %xmm1,0x8(%edi)\n"
    "movq   %xmm2,0x10(%edi)\n"
    "lea    0x18(%edi),%edi\n"
    "sub    $0x18,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

extern "C" void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown34_1_Int_SSSE3\n"
"_ScaleRowDown34_1_Int_SSSE3:\n"
#else
    ".global ScaleRowDown34_1_Int_SSSE3\n"
"ScaleRowDown34_1_Int_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%ebp\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shuf01,%xmm2\n"
    "movdqa _shuf11,%xmm3\n"
    "movdqa _shuf21,%xmm4\n"
    "movdqa _madd01,%xmm5\n"
    "movdqa _madd11,%xmm6\n"
    "movdqa _round34,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%ebp),%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm2,%xmm0\n"
    "pmaddubsw %xmm5,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,(%edi)\n"
    "movdqu 0x8(%esi),%xmm0\n"
    "movdqu 0x8(%esi,%ebp),%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm3,%xmm0\n"
    "pmaddubsw %xmm6,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,0x8(%edi)\n"
    "movdqa 0x10(%esi),%xmm0\n"
    "movdqa 0x10(%esi,%ebp),%xmm1\n"
    "lea    0x20(%esi),%esi\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm4,%xmm0\n"
    "movdqa  _madd21,%xmm1\n"
    "pmaddubsw %xmm1,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,0x10(%edi)\n"
    "lea    0x18(%edi),%edi\n"
    "sub    $0x18,%ecx\n"
    "ja     1b\n"

    "popa\n"
    "ret\n"
);

extern "C" void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown34_0_Int_SSSE3\n"
"_ScaleRowDown34_0_Int_SSSE3:\n"
#else
    ".global ScaleRowDown34_0_Int_SSSE3\n"
"ScaleRowDown34_0_Int_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%ebp\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shuf01,%xmm2\n"
    "movdqa _shuf11,%xmm3\n"
    "movdqa _shuf21,%xmm4\n"
    "movdqa _madd01,%xmm5\n"
    "movdqa _madd11,%xmm6\n"
    "movdqa _round34,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%ebp,1),%xmm1\n"
    "pavgb  %xmm0,%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm2,%xmm0\n"
    "pmaddubsw %xmm5,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,(%edi)\n"
    "movdqu 0x8(%esi),%xmm0\n"
    "movdqu 0x8(%esi,%ebp,1),%xmm1\n"
    "pavgb  %xmm0,%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm3,%xmm0\n"
    "pmaddubsw %xmm6,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,0x8(%edi)\n"
    "movdqa 0x10(%esi),%xmm0\n"
    "movdqa 0x10(%esi,%ebp,1),%xmm1\n"
    "lea    0x20(%esi),%esi\n"
    "pavgb  %xmm0,%xmm1\n"
    "pavgb  %xmm1,%xmm0\n"
    "pshufb %xmm4,%xmm0\n"
    "movdqa  _madd21,%xmm1\n"
    "pmaddubsw %xmm1,%xmm0\n"
    "paddsw %xmm7,%xmm0\n"
    "psrlw  $0x2,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movq   %xmm0,0x10(%edi)\n"
    "lea    0x18(%edi),%edi\n"
    "sub    $0x18,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

#define HAS_SCALEROWDOWN38_SSSE3
extern "C" void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                     uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown38_SSSE3\n"
"_ScaleRowDown38_SSSE3:\n"
#else
    ".global ScaleRowDown38_SSSE3\n"
"ScaleRowDown38_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%edx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shuf38a,%xmm5\n"
    "movdqa _shuf38b,%xmm6\n"
    "pxor   %xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa 0x10(%esi),%xmm1\n"
    "lea    0x20(%esi),%esi\n"
    "pshufb %xmm5,%xmm0\n"
    "pshufb %xmm6,%xmm1\n"
    "paddusb %xmm1,%xmm0\n"
    "movq   %xmm0,(%edi)\n"
    "movhlps %xmm0,%xmm1\n"
    "movd   %xmm1,0x8(%edi)\n"
    "lea    0xc(%edi),%edi\n"
    "sub    $0xc,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

extern "C" void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown38_3_Int_SSSE3\n"
"_ScaleRowDown38_3_Int_SSSE3:\n"
#else
    ".global ScaleRowDown38_3_Int_SSSE3\n"
"ScaleRowDown38_3_Int_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%edx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shufac0,%xmm4\n"
    "movdqa _shufac3,%xmm5\n"
    "movdqa _scaleac3,%xmm6\n"
    "pxor   %xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%edx,1),%xmm2\n"
    "movhlps %xmm0,%xmm1\n"
    "movhlps %xmm2,%xmm3\n"
    "punpcklbw %xmm7,%xmm0\n"
    "punpcklbw %xmm7,%xmm1\n"
    "punpcklbw %xmm7,%xmm2\n"
    "punpcklbw %xmm7,%xmm3\n"
    "paddusw %xmm2,%xmm0\n"
    "paddusw %xmm3,%xmm1\n"
    "movdqa (%esi,%edx,2),%xmm2\n"
    "lea    0x10(%esi),%esi\n"
    "movhlps %xmm2,%xmm3\n"
    "punpcklbw %xmm7,%xmm2\n"
    "punpcklbw %xmm7,%xmm3\n"
    "paddusw %xmm2,%xmm0\n"
    "paddusw %xmm3,%xmm1\n"
    "movdqa %xmm0,%xmm2\n"
    "psrldq $0x2,%xmm0\n"
    "paddusw %xmm0,%xmm2\n"
    "psrldq $0x2,%xmm0\n"
    "paddusw %xmm0,%xmm2\n"
    "pshufb %xmm4,%xmm2\n"
    "movdqa %xmm1,%xmm3\n"
    "psrldq $0x2,%xmm1\n"
    "paddusw %xmm1,%xmm3\n"
    "psrldq $0x2,%xmm1\n"
    "paddusw %xmm1,%xmm3\n"
    "pshufb %xmm5,%xmm3\n"
    "paddusw %xmm3,%xmm2\n"
    "pmulhuw %xmm6,%xmm2\n"
    "packuswb %xmm2,%xmm2\n"
    "movd   %xmm2,(%edi)\n"
    "pextrw $0x2,%xmm2,%eax\n"
    "mov    %ax,0x4(%edi)\n"
    "lea    0x6(%edi),%edi\n"
    "sub    $0x6,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);

extern "C" void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                           uint8* dst_ptr, int dst_width);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleRowDown38_2_Int_SSSE3\n"
"_ScaleRowDown38_2_Int_SSSE3:\n"
#else
    ".global ScaleRowDown38_2_Int_SSSE3\n"
"ScaleRowDown38_2_Int_SSSE3:\n"
#endif
    "pusha\n"
    "mov    0x24(%esp),%esi\n"
    "mov    0x28(%esp),%edx\n"
    "mov    0x2c(%esp),%edi\n"
    "mov    0x30(%esp),%ecx\n"
    "movdqa _shufab0,%xmm4\n"
    "movdqa _shufab1,%xmm5\n"
    "movdqa _shufab2,%xmm6\n"
    "movdqa _scaleab2,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm2\n"
    "pavgb  (%esi,%edx,1),%xmm2\n"
    "lea    0x10(%esi),%esi\n"
    "movdqa %xmm2,%xmm0\n"
    "pshufb %xmm4,%xmm0\n"
    "movdqa %xmm2,%xmm1\n"
    "pshufb %xmm5,%xmm1\n"
    "paddusw %xmm1,%xmm0\n"
    "pshufb %xmm6,%xmm2\n"
    "paddusw %xmm2,%xmm0\n"
    "pmulhuw %xmm7,%xmm0\n"
    "packuswb %xmm0,%xmm0\n"
    "movd   %xmm0,(%edi)\n"
    "pextrw $0x2,%xmm0,%eax\n"
    "mov    %ax,0x4(%edi)\n"
    "lea    0x6(%edi),%edi\n"
    "sub    $0x6,%ecx\n"
    "ja     1b\n"
    "popa\n"
    "ret\n"
);
#endif // __PIC__
1667
1668#define HAS_SCALEADDROWS_SSE2
1669extern "C" void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
1670                                  uint16* dst_ptr, int src_width,
1671                                  int src_height);
1672  asm(
1673    ".text\n"
1674#if defined(OSX)
1675    ".globl _ScaleAddRows_SSE2\n"
1676"_ScaleAddRows_SSE2:\n"
1677#else
1678    ".global ScaleAddRows_SSE2\n"
1679"ScaleAddRows_SSE2:\n"
1680#endif
1681    "pusha\n"
1682    "mov    0x24(%esp),%esi\n"
1683    "mov    0x28(%esp),%edx\n"
1684    "mov    0x2c(%esp),%edi\n"
1685    "mov    0x30(%esp),%ecx\n"
1686    "mov    0x34(%esp),%ebx\n"
1687    "pxor   %xmm7,%xmm7\n"
1688
1689"1:"
1690    "movdqa (%esi),%xmm2\n"
1691    "lea    (%esi,%edx,1),%eax\n"
1692    "movhlps %xmm2,%xmm3\n"
1693    "lea    -0x1(%ebx),%ebp\n"
1694    "punpcklbw %xmm7,%xmm2\n"
1695    "punpcklbw %xmm7,%xmm3\n"
1696
1697"2:"
1698    "movdqa (%eax),%xmm0\n"
1699    "lea    (%eax,%edx,1),%eax\n"
1700    "movhlps %xmm0,%xmm1\n"
1701    "punpcklbw %xmm7,%xmm0\n"
1702    "punpcklbw %xmm7,%xmm1\n"
1703    "paddusw %xmm0,%xmm2\n"
1704    "paddusw %xmm1,%xmm3\n"
1705    "sub    $0x1,%ebp\n"
1706    "ja     2b\n"
1707
1708    "movdqa %xmm2,(%edi)\n"
1709    "movdqa %xmm3,0x10(%edi)\n"
1710    "lea    0x20(%edi),%edi\n"
1711    "lea    0x10(%esi),%esi\n"
1712    "sub    $0x10,%ecx\n"
1713    "ja     1b\n"
1714    "popa\n"
1715    "ret\n"
1716);

// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
#define HAS_SCALEFILTERROWS_SSE2
extern "C" void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                     const uint8* src_ptr, int src_stride,
                                     int dst_width, int source_y_fraction);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleFilterRows_SSE2\n"
"_ScaleFilterRows_SSE2:\n"
#else
    ".global ScaleFilterRows_SSE2\n"
"ScaleFilterRows_SSE2:\n"
#endif
    "push   %esi\n"
    "push   %edi\n"
    "mov    0xc(%esp),%edi\n"
    "mov    0x10(%esp),%esi\n"
    "mov    0x14(%esp),%edx\n"
    "mov    0x18(%esp),%ecx\n"
    "mov    0x1c(%esp),%eax\n"
    "cmp    $0x0,%eax\n"
    "je     2f\n"
    "cmp    $0x80,%eax\n"
    "je     3f\n"
    "movd   %eax,%xmm6\n"
    "punpcklwd %xmm6,%xmm6\n"
    "pshufd $0x0,%xmm6,%xmm6\n"
    "neg    %eax\n"
    "add    $0x100,%eax\n"
    "movd   %eax,%xmm5\n"
    "punpcklwd %xmm5,%xmm5\n"
    "pshufd $0x0,%xmm5,%xmm5\n"
    "pxor   %xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%edx,1),%xmm2\n"
    "lea    0x10(%esi),%esi\n"
    "movdqa %xmm0,%xmm1\n"
    "movdqa %xmm2,%xmm3\n"
    "punpcklbw %xmm7,%xmm0\n"
    "punpcklbw %xmm7,%xmm2\n"
    "punpckhbw %xmm7,%xmm1\n"
    "punpckhbw %xmm7,%xmm3\n"
    "pmullw %xmm5,%xmm0\n"
    "pmullw %xmm5,%xmm1\n"
    "pmullw %xmm6,%xmm2\n"
    "pmullw %xmm6,%xmm3\n"
    "paddusw %xmm2,%xmm0\n"
    "paddusw %xmm3,%xmm1\n"
    "psrlw  $0x8,%xmm0\n"
    "psrlw  $0x8,%xmm1\n"
    "packuswb %xmm1,%xmm0\n"
    "movdqa %xmm0,(%edi)\n"
    "lea    0x10(%edi),%edi\n"
    "sub    $0x10,%ecx\n"
    "ja     1b\n"
    "mov    -0x1(%edi),%al\n"
    "mov    %al,(%edi)\n"
    "pop    %edi\n"
    "pop    %esi\n"
    "ret\n"

"2:"
    "movdqa (%esi),%xmm0\n"
    "lea    0x10(%esi),%esi\n"
    "movdqa %xmm0,(%edi)\n"
    "lea    0x10(%edi),%edi\n"
    "sub    $0x10,%ecx\n"
    "ja     2b\n"

    "mov    -0x1(%edi),%al\n"
    "mov    %al,(%edi)\n"
    "pop    %edi\n"
    "pop    %esi\n"
    "ret\n"

"3:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%edx,1),%xmm2\n"
    "lea    0x10(%esi),%esi\n"
    "pavgb  %xmm2,%xmm0\n"
    "movdqa %xmm0,(%edi)\n"
    "lea    0x10(%edi),%edi\n"
    "sub    $0x10,%ecx\n"
    "ja     3b\n"

    "mov    -0x1(%edi),%al\n"
    "mov    %al,(%edi)\n"
    "pop    %edi\n"
    "pop    %esi\n"
    "ret\n"
);

// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
#define HAS_SCALEFILTERROWS_SSSE3
extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                      const uint8* src_ptr, int src_stride,
                                      int dst_width, int source_y_fraction);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _ScaleFilterRows_SSSE3\n"
"_ScaleFilterRows_SSSE3:\n"
#else
    ".global ScaleFilterRows_SSSE3\n"
"ScaleFilterRows_SSSE3:\n"
#endif
    "push   %esi\n"
    "push   %edi\n"
    "mov    0xc(%esp),%edi\n"
    "mov    0x10(%esp),%esi\n"
    "mov    0x14(%esp),%edx\n"
    "mov    0x18(%esp),%ecx\n"
    "mov    0x1c(%esp),%eax\n"
    "cmp    $0x0,%eax\n"
    "je     2f\n"
    "cmp    $0x80,%eax\n"
    "je     3f\n"
    "shr    %eax\n"
    "mov    %al,%ah\n"
    "neg    %al\n"
    "add    $0x80,%al\n"
    "movd   %eax,%xmm7\n"
    "punpcklwd %xmm7,%xmm7\n"
    "pshufd $0x0,%xmm7,%xmm7\n"

"1:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%edx,1),%xmm2\n"
    "lea    0x10(%esi),%esi\n"
    "movdqa %xmm0,%xmm1\n"
    "punpcklbw %xmm2,%xmm0\n"
    "punpckhbw %xmm2,%xmm1\n"
    "pmaddubsw %xmm7,%xmm0\n"
    "pmaddubsw %xmm7,%xmm1\n"
    "psrlw  $0x7,%xmm0\n"
    "psrlw  $0x7,%xmm1\n"
    "packuswb %xmm1,%xmm0\n"
    "movdqa %xmm0,(%edi)\n"
    "lea    0x10(%edi),%edi\n"
    "sub    $0x10,%ecx\n"
    "ja     1b\n"
    "mov    -0x1(%edi),%al\n"
    "mov    %al,(%edi)\n"
    "pop    %edi\n"
    "pop    %esi\n"
    "ret\n"

"2:"
    "movdqa (%esi),%xmm0\n"
    "lea    0x10(%esi),%esi\n"
    "movdqa %xmm0,(%edi)\n"
    "lea    0x10(%edi),%edi\n"
    "sub    $0x10,%ecx\n"
    "ja     2b\n"
    "mov    -0x1(%edi),%al\n"
    "mov    %al,(%edi)\n"
    "pop    %edi\n"
    "pop    %esi\n"
    "ret\n"

"3:"
    "movdqa (%esi),%xmm0\n"
    "movdqa (%esi,%edx,1),%xmm2\n"
    "lea    0x10(%esi),%esi\n"
    "pavgb  %xmm2,%xmm0\n"
    "movdqa %xmm0,(%edi)\n"
    "lea    0x10(%edi),%edi\n"
    "sub    $0x10,%ecx\n"
    "ja     3b\n"
    "mov    -0x1(%edi),%al\n"
    "mov    %al,(%edi)\n"
    "pop    %edi\n"
    "pop    %esi\n"
    "ret\n"
);

#elif defined(__x86_64__)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile(
  "lea        (%3,%3,2),%%r10\n"
  "pxor       %%xmm7,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "movdqa     (%0,%3,1),%%xmm2\n"
  "movdqa     0x10(%0,%3,1),%%xmm3\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     (%0,%3,2),%%xmm2\n"
  "movdqa     0x10(%0,%3,2),%%xmm3\n"
  "movdqa     (%0,%%r10,1),%%xmm4\n"
  "movdqa     0x10(%0,%%r10,1),%%xmm5\n"
  "lea        (%0,%3,4),%%r11\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm4,%%xmm2\n"
  "pavgb      %%xmm5,%%xmm3\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "movdqa     0x0(%%r11),%%xmm2\n"
  "movdqa     0x10(%%r11),%%xmm3\n"
  "movdqa     0x0(%%r11,%3,1),%%xmm4\n"
  "movdqa     0x10(%%r11,%3,1),%%xmm5\n"
  "pavgb      %%xmm4,%%xmm2\n"
  "pavgb      %%xmm5,%%xmm3\n"
  "movdqa     0x0(%%r11,%3,2),%%xmm4\n"
  "movdqa     0x10(%%r11,%3,2),%%xmm5\n"
  "movdqa     0x0(%%r11,%%r10,1),%%xmm6\n"
  "pavgb      %%xmm6,%%xmm4\n"
  "movdqa     0x10(%%r11,%%r10,1),%%xmm6\n"
  "pavgb      %%xmm6,%%xmm5\n"
  "pavgb      %%xmm4,%%xmm2\n"
  "pavgb      %%xmm5,%%xmm3\n"
  "pavgb      %%xmm2,%%xmm0\n"
  "pavgb      %%xmm3,%%xmm1\n"
  "psadbw     %%xmm7,%%xmm0\n"
  "psadbw     %%xmm7,%%xmm1\n"
  "pshufd     $0xd8,%%xmm0,%%xmm0\n"
  "pshufd     $0x8d,%%xmm1,%%xmm1\n"
  "por        %%xmm1,%%xmm0\n"
  "psrlw      $0x3,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movd       %%xmm0,(%1)\n"
  "lea        0x4(%1),%1\n"
  "sub        $0x4,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"(static_cast<intptr_t>(src_stride))   // %3
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3",
    "xmm4", "xmm5", "xmm6", "xmm7"
);
}

#define HAS_SCALEROWDOWN34_SSSE3
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  asm volatile(
  "movdqa     (%3),%%xmm3\n"
  "movdqa     (%4),%%xmm4\n"
  "movdqa     (%5),%%xmm5\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm2\n"
  "lea        0x20(%0),%0\n"
  "movdqa     %%xmm2,%%xmm1\n"
  "palignr    $0x8,%%xmm0,%%xmm1\n"
  "pshufb     %%xmm3,%%xmm0\n"
  "pshufb     %%xmm4,%%xmm1\n"
  "pshufb     %%xmm5,%%xmm2\n"
  "movq       %%xmm0,(%1)\n"
  "movq       %%xmm1,0x8(%1)\n"
  "movq       %%xmm2,0x10(%1)\n"
  "lea        0x18(%1),%1\n"
  "sub        $0x18,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"(_shuf0),   // %3
    "r"(_shuf1),   // %4
    "r"(_shuf2)    // %5
  : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}

static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile(
  "movdqa     (%4),%%xmm2\n"  // _shuf01
  "movdqa     (%5),%%xmm3\n"  // _shuf11
  "movdqa     (%6),%%xmm4\n"  // _shuf21
  "movdqa     (%7),%%xmm5\n"  // _madd01
  "movdqa     (%8),%%xmm6\n"  // _madd11
  "movdqa     (%9),%%xmm7\n"  // _round34
  "movdqa     (%10),%%xmm8\n"  // _madd21
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     (%0,%3),%%xmm1\n"
  "pavgb      %%xmm1,%%xmm0\n"
  "pshufb     %%xmm2,%%xmm0\n"
  "pmaddubsw  %%xmm5,%%xmm0\n"
  "paddsw     %%xmm7,%%xmm0\n"
  "psrlw      $0x2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "movdqu     0x8(%0),%%xmm0\n"
  "movdqu     0x8(%0,%3),%%xmm1\n"
  "pavgb      %%xmm1,%%xmm0\n"
  "pshufb     %%xmm3,%%xmm0\n"
  "pmaddubsw  %%xmm6,%%xmm0\n"
  "paddsw     %%xmm7,%%xmm0\n"
  "psrlw      $0x2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,0x8(%1)\n"
  "movdqa     0x10(%0),%%xmm0\n"
  "movdqa     0x10(%0,%3),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm1,%%xmm0\n"
  "pshufb     %%xmm4,%%xmm0\n"
  "pmaddubsw  %%xmm8,%%xmm0\n"
  "paddsw     %%xmm7,%%xmm0\n"
  "psrlw      $0x2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,0x10(%1)\n"
  "lea        0x18(%1),%1\n"
  "sub        $0x18,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"(static_cast<intptr_t>(src_stride)),  // %3
    "r"(_shuf01),   // %4
    "r"(_shuf11),   // %5
    "r"(_shuf21),   // %6
    "r"(_madd01),   // %7
    "r"(_madd11),   // %8
    "r"(_round34),  // %9
    "r"(_madd21)    // %10
  : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
    "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
);
}

static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile(
  "movdqa     (%4),%%xmm2\n"  // _shuf01
  "movdqa     (%5),%%xmm3\n"  // _shuf11
  "movdqa     (%6),%%xmm4\n"  // _shuf21
  "movdqa     (%7),%%xmm5\n"  // _madd01
  "movdqa     (%8),%%xmm6\n"  // _madd11
  "movdqa     (%9),%%xmm7\n"  // _round34
  "movdqa     (%10),%%xmm8\n"  // _madd21
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     (%0,%3,1),%%xmm1\n"
  "pavgb      %%xmm0,%%xmm1\n"
  "pavgb      %%xmm1,%%xmm0\n"
  "pshufb     %%xmm2,%%xmm0\n"
  "pmaddubsw  %%xmm5,%%xmm0\n"
  "paddsw     %%xmm7,%%xmm0\n"
  "psrlw      $0x2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "movdqu     0x8(%0),%%xmm0\n"
  "movdqu     0x8(%0,%3,1),%%xmm1\n"
  "pavgb      %%xmm0,%%xmm1\n"
  "pavgb      %%xmm1,%%xmm0\n"
  "pshufb     %%xmm3,%%xmm0\n"
  "pmaddubsw  %%xmm6,%%xmm0\n"
  "paddsw     %%xmm7,%%xmm0\n"
  "psrlw      $0x2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,0x8(%1)\n"
  "movdqa     0x10(%0),%%xmm0\n"
  "movdqa     0x10(%0,%3,1),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pavgb      %%xmm0,%%xmm1\n"
  "pavgb      %%xmm1,%%xmm0\n"
  "pshufb     %%xmm4,%%xmm0\n"
  "pmaddubsw  %%xmm8,%%xmm0\n"
  "paddsw     %%xmm7,%%xmm0\n"
  "psrlw      $0x2,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movq       %%xmm0,0x10(%1)\n"
  "lea        0x18(%1),%1\n"
  "sub        $0x18,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"(static_cast<intptr_t>(src_stride)),  // %3
    "r"(_shuf01),   // %4
    "r"(_shuf11),   // %5
    "r"(_shuf21),   // %6
    "r"(_madd01),   // %7
    "r"(_madd11),   // %8
    "r"(_round34),  // %9
    "r"(_madd21)    // %10
  : "memory", "xmm0", "xmm1", "xmm2", "xmm3",
    "xmm4", "xmm5", "xmm6", "xmm7", "xmm8"
);
}

#define HAS_SCALEROWDOWN38_SSSE3
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  asm volatile(
  "movdqa     (%3),%%xmm5\n"
  "movdqa     (%4),%%xmm6\n"
  "pxor       %%xmm7,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     0x10(%0),%%xmm1\n"
  "lea        0x20(%0),%0\n"
  "pshufb     %%xmm5,%%xmm0\n"
  "pshufb     %%xmm6,%%xmm1\n"
  "paddusb    %%xmm1,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "movhlps    %%xmm0,%%xmm1\n"
  "movd       %%xmm1,0x8(%1)\n"
  "lea        0xc(%1),%1\n"
  "sub        $0xc,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"(_shuf38a),  // %3
    "r"(_shuf38b)   // %4
  : "memory", "xmm0", "xmm1", "xmm5", "xmm6", "xmm7"
);
}

static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile(
  "movdqa     (%4),%%xmm4\n"
  "movdqa     (%5),%%xmm5\n"
  "movdqa     (%6),%%xmm6\n"
  "pxor       %%xmm7,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "movdqa     (%0,%3,1),%%xmm2\n"
  "movhlps    %%xmm0,%%xmm1\n"
  "movhlps    %%xmm2,%%xmm3\n"
  "punpcklbw  %%xmm7,%%xmm0\n"
  "punpcklbw  %%xmm7,%%xmm1\n"
  "punpcklbw  %%xmm7,%%xmm2\n"
  "punpcklbw  %%xmm7,%%xmm3\n"
  "paddusw    %%xmm2,%%xmm0\n"
  "paddusw    %%xmm3,%%xmm1\n"
  "movdqa     (%0,%3,2),%%xmm2\n"
  "lea        0x10(%0),%0\n"
  "movhlps    %%xmm2,%%xmm3\n"
  "punpcklbw  %%xmm7,%%xmm2\n"
  "punpcklbw  %%xmm7,%%xmm3\n"
  "paddusw    %%xmm2,%%xmm0\n"
  "paddusw    %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "psrldq     $0x2,%%xmm0\n"
  "paddusw    %%xmm0,%%xmm2\n"
  "psrldq     $0x2,%%xmm0\n"
  "paddusw    %%xmm0,%%xmm2\n"
  "pshufb     %%xmm4,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "psrldq     $0x2,%%xmm1\n"
  "paddusw    %%xmm1,%%xmm3\n"
  "psrldq     $0x2,%%xmm1\n"
  "paddusw    %%xmm1,%%xmm3\n"
  "pshufb     %%xmm5,%%xmm3\n"
  "paddusw    %%xmm3,%%xmm2\n"
  "pmulhuw    %%xmm6,%%xmm2\n"
  "packuswb   %%xmm2,%%xmm2\n"
  "movd       %%xmm2,(%1)\n"
  "pextrw     $0x2,%%xmm2,%%eax\n"
  "mov        %%ax,0x4(%1)\n"
  "lea        0x6(%1),%1\n"
  "sub        $0x6,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"(static_cast<intptr_t>(src_stride)),  // %3
    "r"(_shufac0),   // %4
    "r"(_shufac3),   // %5
    "r"(_scaleac3)   // %6
  : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
    "xmm4", "xmm5", "xmm6", "xmm7"
);
}

static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  asm volatile(
  "movdqa     (%4),%%xmm4\n"
  "movdqa     (%5),%%xmm5\n"
  "movdqa     (%6),%%xmm6\n"
  "movdqa     (%7),%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm2\n"
  "pavgb      (%0,%3,1),%%xmm2\n"
  "lea        0x10(%0),%0\n"
  "movdqa     %%xmm2,%%xmm0\n"
  "pshufb     %%xmm4,%%xmm0\n"
  "movdqa     %%xmm2,%%xmm1\n"
  "pshufb     %%xmm5,%%xmm1\n"
  "paddusw    %%xmm1,%%xmm0\n"
  "pshufb     %%xmm6,%%xmm2\n"
  "paddusw    %%xmm2,%%xmm0\n"
  "pmulhuw    %%xmm7,%%xmm0\n"
  "packuswb   %%xmm0,%%xmm0\n"
  "movd       %%xmm0,(%1)\n"
  "pextrw     $0x2,%%xmm0,%%eax\n"
  "mov        %%ax,0x4(%1)\n"
  "lea        0x6(%1),%1\n"
  "sub        $0x6,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"(static_cast<intptr_t>(src_stride)),  // %3
    "r"(_shufab0),   // %4
    "r"(_shufab1),   // %5
    "r"(_shufab2),   // %6
    "r"(_scaleab2)   // %7
  : "memory", "rax", "xmm0", "xmm1", "xmm2",
    "xmm4", "xmm5", "xmm6", "xmm7"
);
}

#define HAS_SCALEADDROWS_SSE2
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
  asm volatile(
  "pxor       %%xmm7,%%xmm7\n"
"1:"
  "movdqa     (%0),%%xmm2\n"
  "lea        (%0,%4,1),%%r10\n"
  "movhlps    %%xmm2,%%xmm3\n"
  "lea        -0x1(%3),%%r11\n"
  "punpcklbw  %%xmm7,%%xmm2\n"
  "punpcklbw  %%xmm7,%%xmm3\n"

"2:"
  "movdqa     (%%r10),%%xmm0\n"
  "lea        (%%r10,%4,1),%%r10\n"
  "movhlps    %%xmm0,%%xmm1\n"
  "punpcklbw  %%xmm7,%%xmm0\n"
  "punpcklbw  %%xmm7,%%xmm1\n"
  "paddusw    %%xmm0,%%xmm2\n"
  "paddusw    %%xmm1,%%xmm3\n"
  "sub        $0x1,%%r11\n"
  "ja         2b\n"

  "movdqa     %%xmm2,(%1)\n"
  "movdqa     %%xmm3,0x10(%1)\n"
  "lea        0x20(%1),%1\n"
  "lea        0x10(%0),%0\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width),   // %2
    "+r"(src_height)   // %3
  : "r"(static_cast<intptr_t>(src_stride))  // %4
  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7"
);
}

// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
#define HAS_SCALEFILTERROWS_SSE2
static void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                 const uint8* src_ptr, int src_stride,
                                 int dst_width, int source_y_fraction) {
  if (source_y_fraction == 0) {
    asm volatile(
    "1:"
      "movdqa     (%1),%%xmm0\n"
      "lea        0x10(%1),%1\n"
      "movdqa     %%xmm0,(%0)\n"
      "lea        0x10(%0),%0\n"
      "sub        $0x10,%2\n"
      "ja         1b\n"
      "mov        -0x1(%0),%%al\n"
      "mov        %%al,(%0)\n"
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+r"(dst_width)    // %2
      :
      : "memory", "rax", "xmm0"
    );
    return;
  } else if (source_y_fraction == 128) {
    asm volatile(
    "1:"
      "movdqa     (%1),%%xmm0\n"
      "movdqa     (%1,%3,1),%%xmm2\n"
      "lea        0x10(%1),%1\n"
      "pavgb      %%xmm2,%%xmm0\n"
      "movdqa     %%xmm0,(%0)\n"
      "lea        0x10(%0),%0\n"
      "sub        $0x10,%2\n"
      "ja         1b\n"
      "mov        -0x1(%0),%%al\n"
      "mov        %%al,(%0)\n"
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+r"(dst_width)    // %2
      : "r"(static_cast<intptr_t>(src_stride))  // %3
      : "memory", "rax", "xmm0", "xmm2"
    );
    return;
  } else {
    asm volatile(
      "mov        %3,%%eax\n"
      "movd       %%eax,%%xmm6\n"
      "punpcklwd  %%xmm6,%%xmm6\n"
      "pshufd     $0x0,%%xmm6,%%xmm6\n"
      "neg        %%eax\n"
      "add        $0x100,%%eax\n"
      "movd       %%eax,%%xmm5\n"
      "punpcklwd  %%xmm5,%%xmm5\n"
      "pshufd     $0x0,%%xmm5,%%xmm5\n"
      "pxor       %%xmm7,%%xmm7\n"
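      // xmm6 holds source_y_fraction and xmm5 holds 256 - source_y_fraction,
      // each replicated across all eight words; the loop widens both rows to
      // 16 bits, weights them with pmullw, sums, and renormalizes with >> 8.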
    "1:"
      "movdqa     (%1),%%xmm0\n"
      "movdqa     (%1,%4,1),%%xmm2\n"
      "lea        0x10(%1),%1\n"
      "movdqa     %%xmm0,%%xmm1\n"
      "movdqa     %%xmm2,%%xmm3\n"
      "punpcklbw  %%xmm7,%%xmm0\n"
      "punpcklbw  %%xmm7,%%xmm2\n"
      "punpckhbw  %%xmm7,%%xmm1\n"
      "punpckhbw  %%xmm7,%%xmm3\n"
      "pmullw     %%xmm5,%%xmm0\n"
      "pmullw     %%xmm5,%%xmm1\n"
      "pmullw     %%xmm6,%%xmm2\n"
      "pmullw     %%xmm6,%%xmm3\n"
      "paddusw    %%xmm2,%%xmm0\n"
      "paddusw    %%xmm3,%%xmm1\n"
      "psrlw      $0x8,%%xmm0\n"
      "psrlw      $0x8,%%xmm1\n"
      "packuswb   %%xmm1,%%xmm0\n"
      "movdqa     %%xmm0,(%0)\n"
      "lea        0x10(%0),%0\n"
      "sub        $0x10,%2\n"
      "ja         1b\n"
      "mov        -0x1(%0),%%al\n"
      "mov        %%al,(%0)\n"
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+r"(dst_width),   // %2
        "+r"(source_y_fraction)  // %3
      : "r"(static_cast<intptr_t>(src_stride))  // %4
      : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm3",
        "xmm5", "xmm6", "xmm7"
    );
  }
  return;
}

// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
#define HAS_SCALEFILTERROWS_SSSE3
static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
                                  const uint8* src_ptr, int src_stride,
                                  int dst_width, int source_y_fraction) {
  if (source_y_fraction == 0) {
    asm volatile(
    "1:"
      "movdqa     (%1),%%xmm0\n"
      "lea        0x10(%1),%1\n"
      "movdqa     %%xmm0,(%0)\n"
      "lea        0x10(%0),%0\n"
      "sub        $0x10,%2\n"
      "ja         1b\n"
      "mov        -0x1(%0),%%al\n"
      "mov        %%al,(%0)\n"
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+r"(dst_width)    // %2
      :
      : "memory", "rax", "xmm0"
    );
    return;
  } else if (source_y_fraction == 128) {
    asm volatile(
    "1:"
      "movdqa     (%1),%%xmm0\n"
      "movdqa     (%1,%3,1),%%xmm2\n"
      "lea        0x10(%1),%1\n"
      "pavgb      %%xmm2,%%xmm0\n"
      "movdqa     %%xmm0,(%0)\n"
      "lea        0x10(%0),%0\n"
      "sub        $0x10,%2\n"
      "ja         1b\n"
      "mov        -0x1(%0),%%al\n"
      "mov        %%al,(%0)\n"
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+r"(dst_width)    // %2
      : "r"(static_cast<intptr_t>(src_stride))  // %3
      : "memory", "rax", "xmm0", "xmm2"
    );
    return;
  } else {
    asm volatile(
      "mov        %3,%%eax\n"
      "shr        %%eax\n"
      "mov        %%al,%%ah\n"
      "neg        %%al\n"
      "add        $0x80,%%al\n"
      "movd       %%eax,%%xmm7\n"
      "punpcklwd  %%xmm7,%%xmm7\n"
      "pshufd     $0x0,%%xmm7,%%xmm7\n"
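      // The scalar setup above packs (128 - fraction/2) and fraction/2 into
      // adjacent bytes and broadcasts the pair, so pmaddubsw can blend each
      // interleaved row0/row1 byte pair in one step; the weights sum to 128,
      // hence the >> 7 renormalization below.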
    "1:"
      "movdqa     (%1),%%xmm0\n"
      "movdqa     (%1,%4,1),%%xmm2\n"
      "lea        0x10(%1),%1\n"
      "movdqa     %%xmm0,%%xmm1\n"
      "punpcklbw  %%xmm2,%%xmm0\n"
      "punpckhbw  %%xmm2,%%xmm1\n"
      "pmaddubsw  %%xmm7,%%xmm0\n"
      "pmaddubsw  %%xmm7,%%xmm1\n"
      "psrlw      $0x7,%%xmm0\n"
      "psrlw      $0x7,%%xmm1\n"
      "packuswb   %%xmm1,%%xmm0\n"
      "movdqa     %%xmm0,(%0)\n"
      "lea        0x10(%0),%0\n"
      "sub        $0x10,%2\n"
      "ja         1b\n"
      "mov        -0x1(%0),%%al\n"
      "mov        %%al,(%0)\n"
      : "+r"(dst_ptr),     // %0
        "+r"(src_ptr),     // %1
        "+r"(dst_width),   // %2
        "+r"(source_y_fraction)  // %3
      : "r"(static_cast<intptr_t>(src_stride))  // %4
      : "memory", "rax", "xmm0", "xmm1", "xmm2", "xmm7"
    );
  }
  return;
}
#endif
#endif

// CPU agnostic row functions
static void ScaleRowDown2_C(const uint8* src_ptr, int,
                            uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    *dst++ = *src_ptr;
    src_ptr += 2;
  }
}

static void ScaleRowDown2Int_C(const uint8* src_ptr, int src_stride,
                               uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    *dst++ = (src_ptr[0] + src_ptr[1] +
              src_ptr[src_stride] + src_ptr[src_stride + 1] + 2) >> 2;
    src_ptr += 2;
  }
}
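
// For example, the 2x2 box {250, 252, 251, 253} yields
// (250 + 252 + 251 + 253 + 2) >> 2 = 1008 >> 2 = 252; the +2 rounds to
// nearest rather than truncating (the exact average is 251.5).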

static void ScaleRowDown4_C(const uint8* src_ptr, int,
                            uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    *dst++ = *src_ptr;
    src_ptr += 4;
  }
}

static void ScaleRowDown4Int_C(const uint8* src_ptr, int src_stride,
                               uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    *dst++ = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
              src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
              src_ptr[src_stride + 2] + src_ptr[src_stride + 3] +
              src_ptr[src_stride * 2 + 0] + src_ptr[src_stride * 2 + 1] +
              src_ptr[src_stride * 2 + 2] + src_ptr[src_stride * 2 + 3] +
              src_ptr[src_stride * 3 + 0] + src_ptr[src_stride * 3 + 1] +
              src_ptr[src_stride * 3 + 2] + src_ptr[src_stride * 3 + 3] +
              8) >> 4;
    src_ptr += 4;
  }
}

// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
// Keeping the total buffer under 4096 bytes avoids a stack check, saving 4% cpu.
static const int kMaxOutputWidth = 640;
static const int kMaxRow12 = kMaxOutputWidth * 2;

static void ScaleRowDown8_C(const uint8* src_ptr, int,
                            uint8* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    *dst++ = *src_ptr;
    src_ptr += 8;
  }
}

// Note: calling code checks that dst_width is at most kMaxOutputWidth and
// uses ScaleRowDown8_C instead if it is not.
static void ScaleRowDown8Int_C(const uint8* src_ptr, int src_stride,
                               uint8* dst, int dst_width) {
  ALIGN16(uint8 src_row[kMaxRow12 * 2]);
  assert(dst_width <= kMaxOutputWidth);
  ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
  ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
                     src_row + kMaxOutputWidth,
                     dst_width * 2);
  ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
}
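
// The 8x8 box above is assembled from the smaller kernels: two
// ScaleRowDown4Int_C passes reduce rows 0-3 and 4-7 into src_row, and a
// final ScaleRowDown2Int_C pass averages those intermediates into the 1/8
// output row.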

static void ScaleRowDown34_C(const uint8* src_ptr, int,
                             uint8* dst, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  uint8* dend = dst + dst_width;
  do {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[1];
    dst[2] = src_ptr[3];
    dst += 3;
    src_ptr += 4;
  } while (dst < dend);
}

// Filter rows 0 and 1 together, 3 : 1
static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, int src_stride,
                                   uint8* d, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  uint8* dend = d + dst_width;
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  do {
    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
    d[0] = (a0 * 3 + b0 + 2) >> 2;
    d[1] = (a1 * 3 + b1 + 2) >> 2;
    d[2] = (a2 * 3 + b2 + 2) >> 2;
    d += 3;
    s += 4;
    t += 4;
  } while (d < dend);
}
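
// For example, with s = {8, 12, 16, 20} the horizontal taps give
// a0 = (8 * 3 + 12 + 2) >> 2 = 9, a1 = (12 + 16 + 1) >> 1 = 14 and
// a2 = (16 + 20 * 3 + 2) >> 2 = 19: samples at roughly 1/4, 3/2 and 11/4
// of the way through each 4-pixel group.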

// Filter rows 1 and 2 together, 1 : 1
static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, int src_stride,
                                   uint8* d, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  uint8* dend = d + dst_width;
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  do {
    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
    d[0] = (a0 + b0 + 1) >> 1;
    d[1] = (a1 + b1 + 1) >> 1;
    d[2] = (a2 + b2 + 1) >> 1;
    d += 3;
    s += 4;
    t += 4;
  } while (d < dend);
}

#if defined(HAS_SCALEFILTERROWS_SSE2)
// Filter row to 3/4
static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
                                int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  uint8* dend = dst_ptr + dst_width;
  const uint8* s = src_ptr;
  do {
    dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
    dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
    dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
    dst_ptr += 3;
    s += 4;
  } while (dst_ptr < dend);
}
#endif

static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
                              int dst_width, int dx) {
  int x = 0;
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int xf1 = x & 0xffff;
    int xf0 = 65536 - xf1;

    *dst_ptr++ = (src_ptr[xi] * xf0 + src_ptr[xi + 1] * xf1) >> 16;
    x += dx;
  }
}
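
// For example, doubling 2 source pixels to 4 destination pixels gives
// dx = (2 << 16) / 4 = 0x8000; at the second destination pixel x = 0x8000,
// so xi = 0 and xf1 = 0x8000, blending src_ptr[0] and src_ptr[1] equally.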

static const int kMaxInputWidth = 2560;
#if defined(HAS_SCALEFILTERROWS_SSE2)
#define HAS_SCALEROWDOWN34_SSE2
// Filter rows 0 and 1 together, 3 : 1
static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  ALIGN16(uint8 row[kMaxInputWidth]);
  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3,
                       256 / 4);
  ScaleFilterCols34_C(dst_ptr, row, dst_width);
}

// Filter rows 1 and 2 together, 1 : 1
static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  ALIGN16(uint8 row[kMaxInputWidth]);
  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
  ScaleFilterCols34_C(dst_ptr, row, dst_width);
}
#endif

static void ScaleRowDown38_C(const uint8* src_ptr, int,
                             uint8* dst, int dst_width) {
  assert(dst_width % 3 == 0);
  for (int x = 0; x < dst_width; x += 3) {
    dst[0] = src_ptr[0];
    dst[1] = src_ptr[3];
    dst[2] = src_ptr[6];
    dst += 3;
    src_ptr += 8;
  }
}

// 8x3 -> 3x1
static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, int src_stride,
                                   uint8* dst_ptr, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  for (int i = 0; i < dst_width; i += 3) {
    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
        src_ptr[src_stride + 2] + src_ptr[src_stride * 2 + 0] +
        src_ptr[src_stride * 2 + 1] + src_ptr[src_stride * 2 + 2]) *
        (65536 / 9) >> 16;
    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
        src_ptr[src_stride + 5] + src_ptr[src_stride * 2 + 3] +
        src_ptr[src_stride * 2 + 4] + src_ptr[src_stride * 2 + 5]) *
        (65536 / 9) >> 16;
    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
        src_ptr[src_stride + 6] + src_ptr[src_stride + 7] +
        src_ptr[src_stride * 2 + 6] + src_ptr[src_stride * 2 + 7]) *
        (65536 / 6) >> 16;
    src_ptr += 8;
    dst_ptr += 3;
  }
}
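
// The multiply by 65536 / 9 (or 65536 / 6) followed by >> 16 is a reciprocal
// multiply standing in for an integer division: nine pixels summing to 900
// give 900 * 7281 >> 16 = 99, within 1 of the exact average 100 (the
// truncated reciprocal can round down by one).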

// 8x2 -> 3x1
static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, int src_stride,
                                   uint8* dst_ptr, int dst_width) {
  assert((dst_width % 3 == 0) && (dst_width > 0));
  for (int i = 0; i < dst_width; i += 3) {
    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
        src_ptr[src_stride + 0] + src_ptr[src_stride + 1] +
        src_ptr[src_stride + 2]) * (65536 / 6) >> 16;
    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
        src_ptr[src_stride + 3] + src_ptr[src_stride + 4] +
        src_ptr[src_stride + 5]) * (65536 / 6) >> 16;
    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
        src_ptr[src_stride + 6] + src_ptr[src_stride + 7]) *
        (65536 / 4) >> 16;
    src_ptr += 8;
    dst_ptr += 3;
  }
}

// C version 8x2 -> 8x1
static void ScaleFilterRows_C(uint8* dst_ptr,
                              const uint8* src_ptr, int src_stride,
                              int dst_width, int source_y_fraction) {
  assert(dst_width > 0);
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  uint8* end = dst_ptr + dst_width;
  do {
    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
    src_ptr += 8;
    src_ptr1 += 8;
    dst_ptr += 8;
  } while (dst_ptr < end);
  dst_ptr[0] = dst_ptr[-1];
}
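
// In ScaleFilterRows_C above, source_y_fraction = 64 weights the rows
// 192:64, i.e. 3/4 of the upper row plus 1/4 of the lower one (the 256 / 4
// fraction that ScaleRowDown34_0_Int_SSE2 passes in). The trailing
// dst_ptr[0] = dst_ptr[-1] duplicates the last pixel one past the row, like
// the assembler versions, so a following horizontal filter may safely read
// one pixel beyond dst_width.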

void ScaleAddRows_C(const uint8* src_ptr, int src_stride,
                    uint16* dst_ptr, int src_width, int src_height) {
  assert(src_width > 0);
  assert(src_height > 0);
  for (int x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    int sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}
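
// With 8-bit input, the uint16 accumulators in dst_ptr can absorb up to
// 257 rows of 255 (255 * 257 = 65535) before overflowing, more than enough
// for typical box heights.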

/**
 * Scale plane, 1/2
 *
 * This is an optimized version for scaling down a plane to 1/2 of
 * its original size.
 *
 */
static void ScalePlaneDown2(int src_width, int src_height,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
                            const uint8* src_ptr, uint8* dst_ptr,
                            FilterMode filtering) {
  assert(src_width % 2 == 0);
  assert(src_height % 2 == 0);
  void (*ScaleRowDown2)(const uint8* src_ptr, int src_stride,
                        uint8* dst_ptr, int dst_width);

#if defined(HAS_SCALEROWDOWN2_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (dst_width % 16 == 0) && (src_stride % 16 == 0) &&
      (dst_stride % 16 == 0) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
    ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
  } else
#endif
#if defined(HAS_SCALEROWDOWN2_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (dst_width % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
      IS_ALIGNED(dst_ptr, 16)) {
    ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
  } else
#endif
  {
    ScaleRowDown2 = filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
  }

  for (int y = 0; y < dst_height; ++y) {
    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
    src_ptr += (src_stride << 1);
    dst_ptr += dst_stride;
  }
}

/**
 * Scale plane, 1/4
 *
 * This is an optimized version for scaling down a plane to 1/4 of
 * its original size.
 */
static void ScalePlaneDown4(int src_width, int src_height,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
                            const uint8* src_ptr, uint8* dst_ptr,
                            FilterMode filtering) {
  assert(src_width % 4 == 0);
  assert(src_height % 4 == 0);
  void (*ScaleRowDown4)(const uint8* src_ptr, int src_stride,
                        uint8* dst_ptr, int dst_width);

#if defined(HAS_SCALEROWDOWN4_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (dst_width % 2 == 0) && (src_stride % 8 == 0) &&
      IS_ALIGNED(src_ptr, 8)) {
    ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
  } else
#endif
#if defined(HAS_SCALEROWDOWN4_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (dst_width % 8 == 0) && (src_stride % 16 == 0) &&
      (dst_stride % 8 == 0) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
    ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
  } else
#endif
  {
    ScaleRowDown4 = filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
  }

  for (int y = 0; y < dst_height; ++y) {
    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
    src_ptr += (src_stride << 2);
    dst_ptr += dst_stride;
  }
}

/**
 * Scale plane, 1/8
 *
 * This is an optimized version for scaling down a plane to 1/8
 * of its original size.
 *
 */
static void ScalePlaneDown8(int src_width, int src_height,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
                            const uint8* src_ptr, uint8* dst_ptr,
                            FilterMode filtering) {
  assert(src_width % 8 == 0);
  assert(src_height % 8 == 0);
  void (*ScaleRowDown8)(const uint8* src_ptr, int src_stride,
                        uint8* dst_ptr, int dst_width);
#if defined(HAS_SCALEROWDOWN8_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (dst_width % 16 == 0) && dst_width <= kMaxOutputWidth &&
      (src_stride % 16 == 0) && (dst_stride % 16 == 0) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 16)) {
    ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
  } else
#endif
  {
    ScaleRowDown8 = filtering && (dst_width <= kMaxOutputWidth) ?
        ScaleRowDown8Int_C : ScaleRowDown8_C;
  }
  for (int y = 0; y < dst_height; ++y) {
    ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
    src_ptr += (src_stride << 3);
    dst_ptr += dst_stride;
  }
}

/**
 * Scale plane down, 3/4
 *
 * Provided by Frank Barchard (fbarchard@google.com)
 *
 */
static void ScalePlaneDown34(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr,
                             FilterMode filtering) {
  assert(dst_width % 3 == 0);
  void (*ScaleRowDown34_0)(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width);
  void (*ScaleRowDown34_1)(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width);
#if defined(HAS_SCALEROWDOWN34_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
      (dst_stride % 8 == 0) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
    if (!filtering) {
      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
    } else {
      ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
      ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
    }
  } else
#endif
#if defined(HAS_SCALEROWDOWN34_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
      (dst_stride % 8 == 0) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8) &&
      filtering) {
    ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
    ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
  } else
#endif
  {
    if (!filtering) {
      ScaleRowDown34_0 = ScaleRowDown34_C;
      ScaleRowDown34_1 = ScaleRowDown34_C;
    } else {
      ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
      ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
    }
  }
  int src_row = 0;
  for (int y = 0; y < dst_height; ++y) {
    switch (src_row) {
      case 0:
        ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
        break;

      case 1:
        ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
        break;

      case 2:
        ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
                         dst_ptr, dst_width);
        break;
    }
    ++src_row;
    src_ptr += src_stride;
    dst_ptr += dst_stride;
    if (src_row >= 3) {
      src_ptr += src_stride;
      src_row = 0;
    }
  }
}

/**
 * Scale plane, 3/8
 *
 * This is an optimized version for scaling down a plane to 3/8
 * of its original size.
 *
 * Reduces 16x3 to 6x1
 */
static void ScalePlaneDown38(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr,
                             FilterMode filtering) {
  assert(dst_width % 3 == 0);
  void (*ScaleRowDown38_3)(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width);
  void (*ScaleRowDown38_2)(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width);
#if defined(HAS_SCALEROWDOWN38_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (dst_width % 24 == 0) && (src_stride % 16 == 0) &&
      (dst_stride % 8 == 0) &&
      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(dst_ptr, 8)) {
    if (!filtering) {
      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
    } else {
      ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
      ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
    }
  } else
#endif
  {
    if (!filtering) {
      ScaleRowDown38_3 = ScaleRowDown38_C;
      ScaleRowDown38_2 = ScaleRowDown38_C;
    } else {
      ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
      ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
    }
  }
  int src_row = 0;
  for (int y = 0; y < dst_height; ++y) {
    switch (src_row) {
      case 0:
      case 1:
        ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
        src_ptr += src_stride * 3;
        ++src_row;
        break;

      case 2:
        ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
        src_ptr += src_stride * 2;
        src_row = 0;
        break;
    }
    dst_ptr += dst_stride;
  }
}

inline static uint32 SumBox(int iboxwidth, int iboxheight,
                            int src_stride, const uint8* src_ptr) {
  assert(iboxwidth > 0);
  assert(iboxheight > 0);
  uint32 sum = 0u;
  for (int y = 0; y < iboxheight; ++y) {
    for (int x = 0; x < iboxwidth; ++x) {
      sum += src_ptr[x];
    }
    src_ptr += src_stride;
  }
  return sum;
}

static void ScalePlaneBoxRow(int dst_width, int boxheight,
                             int dx, int src_stride,
                             const uint8* src_ptr, uint8* dst_ptr) {
  int x = 0;
  for (int i = 0; i < dst_width; ++i) {
    int ix = x >> 16;
    x += dx;
    int boxwidth = (x >> 16) - ix;
    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
        (boxwidth * boxheight);
  }
}

inline static uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
  assert(iboxwidth > 0);
  uint32 sum = 0u;
  for (int x = 0; x < iboxwidth; ++x) {
    sum += src_ptr[x];
  }
  return sum;
}

static void ScaleAddCols2_C(int dst_width, int boxheight, int dx,
                            const uint16* src_ptr, uint8* dst_ptr) {
  int scaletbl[2];
  int minboxwidth = (dx >> 16);
  scaletbl[0] = 65536 / (minboxwidth * boxheight);
  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
  int* scaleptr = scaletbl - minboxwidth;
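  // Biasing the pointer by -minboxwidth lets scaleptr[boxwidth] select
  // scaletbl[0] or scaletbl[1], since each box is either minboxwidth or
  // minboxwidth + 1 pixels wide.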
  int x = 0;
  for (int i = 0; i < dst_width; ++i) {
    int ix = x >> 16;
    x += dx;
    int boxwidth = (x >> 16) - ix;
    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
  }
}

static void ScaleAddCols1_C(int dst_width, int boxheight, int dx,
                            const uint16* src_ptr, uint8* dst_ptr) {
  int boxwidth = (dx >> 16);
  int scaleval = 65536 / (boxwidth * boxheight);
  int x = 0;
  for (int i = 0; i < dst_width; ++i) {
    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
    x += boxwidth;
  }
}
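
// For example, a 2x2 box gives scaleval = 65536 / 4 = 16384, so a box
// summing to 400 produces 400 * 16384 >> 16 = 100, the exact average.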

/**
 * Scale plane down to any dimensions, with interpolation.
 * (boxfilter).
 *
 * Same method as ScalePlaneSimple, which is fixed point, outputting
 * one pixel of destination using fixed point (16.16) to step
 * through source, sampling a box of pixels with simple
 * averaging.
 */
static void ScalePlaneBox(int src_width, int src_height,
                          int dst_width, int dst_height,
                          int src_stride, int dst_stride,
                          const uint8* src_ptr, uint8* dst_ptr) {
  assert(dst_width > 0);
  assert(dst_height > 0);
  int dy = (src_height << 16) / dst_height;
  int dx = (src_width << 16) / dst_width;
  if ((src_width % 16 != 0) || (src_width > kMaxInputWidth) ||
      dst_height * 2 > src_height) {
    uint8* dst = dst_ptr;
    int y = 0;
    for (int j = 0; j < dst_height; ++j) {
      int iy = y >> 16;
      const uint8* const src = src_ptr + iy * src_stride;
      y += dy;
      if (y > (src_height << 16)) {
        y = (src_height << 16);
      }
      int boxheight = (y >> 16) - iy;
      ScalePlaneBoxRow(dst_width, boxheight,
                       dx, src_stride,
                       src, dst);

      dst += dst_stride;
    }
  } else {
    ALIGN16(uint16 row[kMaxInputWidth]);
    void (*ScaleAddRows)(const uint8* src_ptr, int src_stride,
                         uint16* dst_ptr, int src_width, int src_height);
    void (*ScaleAddCols)(int dst_width, int boxheight, int dx,
                         const uint16* src_ptr, uint8* dst_ptr);
#if defined(HAS_SCALEADDROWS_SSE2)
    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
        (src_width % 16) == 0) {
      ScaleAddRows = ScaleAddRows_SSE2;
    } else
#endif
    {
      ScaleAddRows = ScaleAddRows_C;
    }
    if (dx & 0xffff) {
      ScaleAddCols = ScaleAddCols2_C;
    } else {
      ScaleAddCols = ScaleAddCols1_C;
    }

    int y = 0;
    for (int j = 0; j < dst_height; ++j) {
      int iy = y >> 16;
      const uint8* const src = src_ptr + iy * src_stride;
      y += dy;
      if (y > (src_height << 16)) {
        y = (src_height << 16);
      }
      int boxheight = (y >> 16) - iy;
      ScaleAddRows(src, src_stride, row, src_width, boxheight);
      ScaleAddCols(dst_width, boxheight, dx, row, dst_ptr);
      dst_ptr += dst_stride;
    }
  }
}

/**
 * Scale plane to/from any dimensions, with interpolation.
 */
static void ScalePlaneBilinearSimple(int src_width, int src_height,
                                     int dst_width, int dst_height,
                                     int src_stride, int dst_stride,
                                     const uint8* src_ptr, uint8* dst_ptr) {
  uint8* dst = dst_ptr;
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
  int maxx = ((src_width - 1) << 16) - 1;
  int maxy = ((src_height - 1) << 16) - 1;
  int y = (dst_height < src_height) ? 32768 :
      (src_height << 16) / dst_height - 32768;
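  // 32768 is 0.5 in 16.16 fixed point; the half-pixel bias above (and the
  // matching one for x below) starts sampling near pixel centers rather
  // than pixel edges.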
  for (int i = 0; i < dst_height; ++i) {
    int cy = (y < 0) ? 0 : y;
    int yi = cy >> 16;
    int yf = cy & 0xffff;
    const uint8* const src = src_ptr + yi * src_stride;
    int x = (dst_width < src_width) ? 32768 :
        (src_width << 16) / dst_width - 32768;
    for (int j = 0; j < dst_width; ++j) {
      int cx = (x < 0) ? 0 : x;
      int xi = cx >> 16;
      int xf = cx & 0xffff;
      int r0 = (src[xi] * (65536 - xf) + src[xi + 1] * xf) >> 16;
      int r1 = (src[xi + src_stride] * (65536 - xf) +
          src[xi + src_stride + 1] * xf) >> 16;
      *dst++ = (r0 * (65536 - yf) + r1 * yf) >> 16;
      x += dx;
      if (x > maxx)
        x = maxx;
    }
    dst += dst_stride - dst_width;
    y += dy;
    if (y > maxy)
      y = maxy;
  }
}

/**
 * Scale plane to/from any dimensions, with bilinear
 * interpolation.
 */
static void ScalePlaneBilinear(int src_width, int src_height,
                               int dst_width, int dst_height,
                               int src_stride, int dst_stride,
                               const uint8* src_ptr, uint8* dst_ptr) {
  assert(dst_width > 0);
  assert(dst_height > 0);
  int dy = (src_height << 16) / dst_height;
  int dx = (src_width << 16) / dst_width;
  if ((src_width % 8 != 0) || (src_width > kMaxInputWidth)) {
    ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
                             src_stride, dst_stride, src_ptr, dst_ptr);

  } else {
    ALIGN16(uint8 row[kMaxInputWidth + 1]);
    void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
                            int src_stride,
                            int dst_width, int source_y_fraction);
    void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr,
                            int dst_width, int dx);
#if defined(HAS_SCALEFILTERROWS_SSSE3)
    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
        (src_width % 16) == 0) {
      ScaleFilterRows = ScaleFilterRows_SSSE3;
    } else
#endif
#if defined(HAS_SCALEFILTERROWS_SSE2)
    if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
        (src_stride % 16 == 0) && IS_ALIGNED(src_ptr, 16) &&
        (src_width % 16) == 0) {
      ScaleFilterRows = ScaleFilterRows_SSE2;
    } else
#endif
    {
      ScaleFilterRows = ScaleFilterRows_C;
    }
    ScaleFilterCols = ScaleFilterCols_C;

    int y = 0;
    int maxy = ((src_height - 1) << 16) - 1; // max is filter of last 2 rows.
    for (int j = 0; j < dst_height; ++j) {
      int iy = y >> 16;
      int fy = (y >> 8) & 255;
      const uint8* const src = src_ptr + iy * src_stride;
      ScaleFilterRows(row, src, src_stride, src_width, fy);
      ScaleFilterCols(dst_ptr, row, dst_width, dx);
      dst_ptr += dst_stride;
      y += dy;
      if (y > maxy) {
        y = maxy;
      }
    }
  }
}

/**
 * Scale plane to/from any dimensions, without interpolation.
 * Fixed point math is used for performance: The upper 16 bits
 * of x and dx are the integer part of the source position and
 * the lower 16 bits are the fixed decimal part.
 */
static void ScalePlaneSimple(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr) {
  uint8* dst = dst_ptr;
  int dx = (src_width << 16) / dst_width;
  for (int y = 0; y < dst_height; ++y) {
    const uint8* const src = src_ptr + (y * src_height / dst_height) *
        src_stride;
    // TODO(fbarchard): Round X coordinate by setting x=0x8000.
    int x = 0;
    for (int i = 0; i < dst_width; ++i) {
      *dst++ = src[x >> 16];
      x += dx;
    }
    dst += dst_stride - dst_width;
  }
}
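
// For example, 640 -> 480 horizontally gives dx = (640 << 16) / 480 =
// 0x15555, so x advances ~1.333 source pixels per destination pixel and
// src[x >> 16] picks the nearest source sample to the left.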

/**
 * Scale plane to/from any dimensions.
 */
static void ScalePlaneAnySize(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr,
                              FilterMode filtering) {
  if (!filtering) {
    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src_ptr, dst_ptr);
  } else {
    // fall back to non-optimized version
    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src_ptr, dst_ptr);
  }
}

/**
 * Scale plane down, any size
 *
 * This is an optimized version for scaling down a plane to any size.
 * The current implementation is ~10 times faster than the
 * reference implementation for e.g. XGA->LowResPAL.
 *
 */
static void ScalePlaneDown(int src_width, int src_height,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
                           const uint8* src_ptr, uint8* dst_ptr,
                           FilterMode filtering) {
  if (!filtering) {
    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src_ptr, dst_ptr);
  } else if (filtering == kFilterBilinear || dst_height * 2 > src_height) {
    // between 1/2x and 1x use bilinear
    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src_ptr, dst_ptr);
  } else {
    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
                  src_stride, dst_stride, src_ptr, dst_ptr);
  }
}

/**
 * Copy plane, no scaling
 *
 * This simply copies the given plane without scaling.
 * The current implementation is ~115 times faster
 * than the reference implementation.
 *
 */
static void CopyPlane(int src_width, int src_height,
                      int dst_width, int dst_height,
                      int src_stride, int dst_stride,
                      const uint8* src_ptr, uint8* dst_ptr) {
  if (src_stride == src_width && dst_stride == dst_width) {
    // All contiguous, so can use REALLY fast path.
    memcpy(dst_ptr, src_ptr, src_width * src_height);
  } else {
    // Not all contiguous; must copy scanlines individually.
    const uint8* src = src_ptr;
    uint8* dst = dst_ptr;
    for (int i = 0; i < src_height; ++i) {
      memcpy(dst, src, src_width);
      dst += dst_stride;
      src += src_stride;
    }
  }
}

static void ScalePlane(const uint8* src, int src_stride,
                       int src_width, int src_height,
                       uint8* dst, int dst_stride,
                       int dst_width, int dst_height,
                       FilterMode filtering, bool use_ref) {
  // Use specialized scales to improve performance for common resolutions.
  // For example, all 1/2 scalings will use ScalePlaneDown2().
  if (dst_width == src_width && dst_height == src_height) {
    // Straight copy.
    CopyPlane(src_width, src_height, dst_width, dst_height, src_stride,
              dst_stride, src, dst);
  } else if (dst_width <= src_width && dst_height <= src_height) {
    // Scale down.
    if (use_ref) {
      // For testing, allow the optimized versions to be disabled.
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == 3 * src_width &&
               4 * dst_height == 3 * src_height) {
      // Optimized, 3/4.
      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
      // Optimized, 1/2.
      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    // 3/8 rounded up for odd-sized chroma height.
    } else if (8 * dst_width == 3 * src_width &&
               dst_height == ((src_height * 3 + 7) / 8)) {
      // Optimized, 3/8.
      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                       src_stride, dst_stride, src, dst, filtering);
    } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
      // Optimized, 1/4.
      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
      // Optimized, 1/8.
      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
    } else {
      // Arbitrary downsample.
      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
    }
  } else {
    // Arbitrary scale up and/or down.
    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
  }
}
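
// Dispatch examples for the specializations above (illustrative comment
// only):
//   1280x720 -> 640x360: 2 * dst == src in both dimensions -> ScalePlaneDown2.
//   1280x720 -> 960x540: 4 * dst == 3 * src -> ScalePlaneDown34.
//   1280x720 -> 480x270: 8 * 480 == 3 * 1280 and 270 == (720 * 3 + 7) / 8
//                        -> ScalePlaneDown38.
//   640x480  -> 800x600: upscale -> ScalePlaneAnySize.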

/**
 * Scale an I420 image.
 *
 * This function in turn calls a scaling function for each plane,
 * chosen to suit the desired resolutions.
 *
 */

int I420Scale(const uint8* src_y, int src_stride_y,
              const uint8* src_u, int src_stride_u,
              const uint8* src_v, int src_stride_v,
              int src_width, int src_height,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int dst_width, int dst_height,
              FilterMode filtering) {
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
    int halfheight = (src_height + 1) >> 1;
    src_y = src_y + (src_height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
  int halfsrc_width = (src_width + 1) >> 1;
  int halfsrc_height = (src_height + 1) >> 1;
  int halfdst_width = (dst_width + 1) >> 1;
  int halfdst_height = (dst_height + 1) >> 1;

  ScalePlane(src_y, src_stride_y, src_width, src_height,
             dst_y, dst_stride_y, dst_width, dst_height,
             filtering, use_reference_impl_);
  ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
             dst_u, dst_stride_u, halfdst_width, halfdst_height,
             filtering, use_reference_impl_);
  ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
             dst_v, dst_stride_v, halfdst_width, halfdst_height,
             filtering, use_reference_impl_);
  return 0;
}
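
/**
 * Usage sketch (illustrative only; the tightly packed buffer layout and
 * sizes below are assumptions, not part of the library): scale a VGA I420
 * frame down to QVGA with bilinear filtering.
 *
 *   uint8 src[640 * 480 * 3 / 2];  // Y plane, then quarter-size U and V.
 *   uint8 dst[320 * 240 * 3 / 2];
 *   I420Scale(src, 640,                          // Y
 *             src + 640 * 480, 320,              // U
 *             src + 640 * 480 + 320 * 240, 320,  // V
 *             640, 480,
 *             dst, 320,
 *             dst + 320 * 240, 160,
 *             dst + 320 * 240 + 160 * 120, 160,
 *             320, 240, kFilterBilinear);
 */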

int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
          int src_stride_y, int src_stride_u, int src_stride_v,
          int src_width, int src_height,
          uint8* dst_y, uint8* dst_u, uint8* dst_v,
          int dst_stride_y, int dst_stride_u, int dst_stride_v,
          int dst_width, int dst_height,
          bool interpolate) {
  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
    int halfheight = (src_height + 1) >> 1;
    src_y = src_y + (src_height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }
  int halfsrc_width = (src_width + 1) >> 1;
  int halfsrc_height = (src_height + 1) >> 1;
  int halfdst_width = (dst_width + 1) >> 1;
  int halfdst_height = (dst_height + 1) >> 1;
  FilterMode filtering = interpolate ? kFilterBox : kFilterNone;

  ScalePlane(src_y, src_stride_y, src_width, src_height,
             dst_y, dst_stride_y, dst_width, dst_height,
             filtering, use_reference_impl_);
  ScalePlane(src_u, src_stride_u, halfsrc_width, halfsrc_height,
             dst_u, dst_stride_u, halfdst_width, halfdst_height,
             filtering, use_reference_impl_);
  ScalePlane(src_v, src_stride_v, halfsrc_width, halfsrc_height,
             dst_v, dst_stride_v, halfdst_width, halfdst_height,
             filtering, use_reference_impl_);
  return 0;
}

int Scale(const uint8* src, int src_width, int src_height,
          uint8* dst, int dst_width, int dst_height, int ooffset,
          bool interpolate) {
  if (!src || src_width <= 0 || src_height <= 0 ||
      !dst || dst_width <= 0 || dst_height <= 0 || ooffset < 0 ||
      ooffset >= dst_height) {
    return -1;
  }
  ooffset = ooffset & ~1;  // Chroma requires the offset to be a multiple of 2.
  int halfsrc_width = (src_width + 1) >> 1;
  int halfsrc_height = (src_height + 1) >> 1;
  int halfdst_width = (dst_width + 1) >> 1;
  int halfdst_height = (dst_height + 1) >> 1;
  int aheight = dst_height - ooffset * 2;  // Actual output height.
  const uint8* const iyptr = src;
  uint8* oyptr = dst + ooffset * dst_width;
  const uint8* const iuptr = src + src_width * src_height;
  uint8* ouptr = dst + dst_width * dst_height + (ooffset >> 1) * halfdst_width;
  const uint8* const ivptr = src + src_width * src_height +
                             halfsrc_width * halfsrc_height;
  uint8* ovptr = dst + dst_width * dst_height +
                 halfdst_width * halfdst_height +
                 (ooffset >> 1) * halfdst_width;
  return Scale(iyptr, iuptr, ivptr, src_width, halfsrc_width, halfsrc_width,
               src_width, src_height, oyptr, ouptr, ovptr, dst_width,
               halfdst_width, halfdst_width, dst_width, aheight, interpolate);
}
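
// Buffer layout example for the packed entry point above (illustrative
// comment only): for a 640x480 source, iyptr spans bytes [0, 307200),
// iuptr starts at byte 307200 and spans 320 * 240 = 76800 bytes, and
// ivptr starts at byte 384000; with ooffset == 0 the destination planes
// are packed the same way.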

}  // namespace libyuv