1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/scale.h"
12
13#include <assert.h>
14#include <string.h>
15#include <stdlib.h>  // For getenv()
16
17#include "libyuv/cpu_id.h"
18#include "libyuv/planar_functions.h"  // For CopyPlane
19#include "libyuv/row.h"
20
21#ifdef __cplusplus
22namespace libyuv {
23extern "C" {
24#endif
25
26// Bilinear SSE2 is disabled.
27#define SSE2_DISABLED 1
28
29// Note: Some SSE2 reference manuals
30// cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
31
32// Set the following flag to true to revert to only
33// using the reference implementation ScalePlaneBox(), and
34// NOT the optimized versions. Useful for debugging and
35// when comparing the quality of the resulting YUV planes
36// as produced by the optimized and non-optimized versions.
37static bool use_reference_impl_ = false;
38
39LIBYUV_API
40void SetUseReferenceImpl(bool use) {
41  use_reference_impl_ = use;
42}
43
44// ScaleRowDown2Int also used by planar functions
45
46/**
47 * NEON downscalers with interpolation.
48 *
49 * Provided by Fritz Koenig
50 *
51 */
52
53#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
54#define HAS_SCALEROWDOWN2_NEON
55// Note - not static due to reuse in convert for 444 to 420.
56void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
57                        uint8* dst, int dst_width);
58
59void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
60                           uint8* dst, int dst_width);
61
62#define HAS_SCALEROWDOWN4_NEON
63void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
64                        uint8* dst_ptr, int dst_width);
65void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
66                           uint8* dst_ptr, int dst_width);
67
68#define HAS_SCALEROWDOWN34_NEON
69// Down scale from 4 to 3 pixels. Use the neon multilane read/write
70//  to load up the every 4th pixel into a 4 different registers.
71// Point samples 32 pixels to 24 pixels.
72void ScaleRowDown34_NEON(const uint8* src_ptr,
73                         ptrdiff_t /* src_stride */,
74                         uint8* dst_ptr, int dst_width);
75void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr,
76                               ptrdiff_t src_stride,
77                               uint8* dst_ptr, int dst_width);
78void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr,
79                               ptrdiff_t src_stride,
80                               uint8* dst_ptr, int dst_width);
81
82#define HAS_SCALEROWDOWN38_NEON
83// 32 -> 12
84void ScaleRowDown38_NEON(const uint8* src_ptr,
85                         ptrdiff_t /* src_stride */,
86                         uint8* dst_ptr, int dst_width);
87// 32x3 -> 12x1
88void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr,
89                               ptrdiff_t src_stride,
90                               uint8* dst_ptr, int dst_width);
91// 32x2 -> 12x1
92void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr,
93                               ptrdiff_t src_stride,
94                               uint8* dst_ptr, int dst_width);
95// 16x2 -> 16x1
96#define HAS_SCALEFILTERROWS_NEON
97void ScaleFilterRows_NEON(uint8* dst_ptr,
98                          const uint8* src_ptr, ptrdiff_t src_stride,
99                          int dst_width, int source_y_fraction);
100
101/**
102 * SSE2 downscalers with interpolation.
103 *
104 * Provided by Frank Barchard (fbarchard@google.com)
105 *
106 */
107
108
109// Constants for SSSE3 code
110#elif !defined(YUV_DISABLE_ASM) && \
111    (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
112
113// GCC 4.2 on OSX has link error when passing static or const to inline.
114// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
115#ifdef __APPLE__
116#define CONST
117#else
118#define CONST static const
119#endif
120
121// Offsets for source bytes 0 to 9
122CONST uvec8 kShuf0 =
123  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
124
125// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
126CONST uvec8 kShuf1 =
127  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
128
129// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
130CONST uvec8 kShuf2 =
131  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
132
133// Offsets for source bytes 0 to 10
134CONST uvec8 kShuf01 =
135  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
136
137// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
138CONST uvec8 kShuf11 =
139  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
140
141// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
142CONST uvec8 kShuf21 =
143  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
144
145// Coefficients for source bytes 0 to 10
146CONST uvec8 kMadd01 =
147  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
148
149// Coefficients for source bytes 10 to 21
150CONST uvec8 kMadd11 =
151  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
152
153// Coefficients for source bytes 21 to 31
154CONST uvec8 kMadd21 =
155  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
156
157// Coefficients for source bytes 21 to 31
158CONST vec16 kRound34 =
159  { 2, 2, 2, 2, 2, 2, 2, 2 };
160
161CONST uvec8 kShuf38a =
162  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
163
164CONST uvec8 kShuf38b =
165  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
166
167// Arrange words 0,3,6 into 0,1,2
168CONST uvec8 kShufAc =
169  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
170
171// Arrange words 0,3,6 into 3,4,5
172CONST uvec8 kShufAc3 =
173  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
174
175// Scaling values for boxes of 3x3 and 2x3
176CONST uvec16 kScaleAc33 =
177  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
178
179// Arrange first value for pixels 0,1,2,3,4,5
180CONST uvec8 kShufAb0 =
181  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
182
183// Arrange second value for pixels 0,1,2,3,4,5
184CONST uvec8 kShufAb1 =
185  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
186
187// Arrange third value for pixels 0,1,2,3,4,5
188CONST uvec8 kShufAb2 =
189  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
190
191// Scaling values for boxes of 3x2 and 2x2
192CONST uvec16 kScaleAb2 =
193  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
194#endif
195
196#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
197
198#define HAS_SCALEROWDOWN2_SSE2
199// Reads 32 pixels, throws half away and writes 16 pixels.
200// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
201__declspec(naked) __declspec(align(16))
202static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
203                               uint8* dst_ptr, int dst_width) {
204  __asm {
205    mov        eax, [esp + 4]        // src_ptr
206                                     // src_stride ignored
207    mov        edx, [esp + 12]       // dst_ptr
208    mov        ecx, [esp + 16]       // dst_width
209    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
210    psrlw      xmm5, 8
211
212    align      16
213  wloop:
214    movdqa     xmm0, [eax]
215    movdqa     xmm1, [eax + 16]
216    lea        eax,  [eax + 32]
217    pand       xmm0, xmm5
218    pand       xmm1, xmm5
219    packuswb   xmm0, xmm1
220    sub        ecx, 16
221    movdqa     [edx], xmm0
222    lea        edx, [edx + 16]
223    jg         wloop
224
225    ret
226  }
227}
228// Blends 32x2 rectangle to 16x1.
229// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
230__declspec(naked) __declspec(align(16))
231void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
232                           uint8* dst_ptr, int dst_width) {
233  __asm {
234    push       esi
235    mov        eax, [esp + 4 + 4]    // src_ptr
236    mov        esi, [esp + 4 + 8]    // src_stride
237    mov        edx, [esp + 4 + 12]   // dst_ptr
238    mov        ecx, [esp + 4 + 16]   // dst_width
239    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
240    psrlw      xmm5, 8
241
242    align      16
243  wloop:
244    movdqa     xmm0, [eax]
245    movdqa     xmm1, [eax + 16]
246    movdqa     xmm2, [eax + esi]
247    movdqa     xmm3, [eax + esi + 16]
248    lea        eax,  [eax + 32]
249    pavgb      xmm0, xmm2            // average rows
250    pavgb      xmm1, xmm3
251
252    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
253    psrlw      xmm0, 8
254    movdqa     xmm3, xmm1
255    psrlw      xmm1, 8
256    pand       xmm2, xmm5
257    pand       xmm3, xmm5
258    pavgw      xmm0, xmm2
259    pavgw      xmm1, xmm3
260    packuswb   xmm0, xmm1
261
262    sub        ecx, 16
263    movdqa     [edx], xmm0
264    lea        edx, [edx + 16]
265    jg         wloop
266
267    pop        esi
268    ret
269  }
270}
271
272// Reads 32 pixels, throws half away and writes 16 pixels.
273// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
274__declspec(naked) __declspec(align(16))
275static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
276                                         ptrdiff_t src_stride,
277                                         uint8* dst_ptr, int dst_width) {
278  __asm {
279    mov        eax, [esp + 4]        // src_ptr
280                                     // src_stride ignored
281    mov        edx, [esp + 12]       // dst_ptr
282    mov        ecx, [esp + 16]       // dst_width
283    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
284    psrlw      xmm5, 8
285
286    align      16
287  wloop:
288    movdqu     xmm0, [eax]
289    movdqu     xmm1, [eax + 16]
290    lea        eax,  [eax + 32]
291    pand       xmm0, xmm5
292    pand       xmm1, xmm5
293    packuswb   xmm0, xmm1
294    sub        ecx, 16
295    movdqu     [edx], xmm0
296    lea        edx, [edx + 16]
297    jg         wloop
298
299    ret
300  }
301}
302// Blends 32x2 rectangle to 16x1.
303// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
304__declspec(naked) __declspec(align(16))
305static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
306                                            ptrdiff_t src_stride,
307                                            uint8* dst_ptr, int dst_width) {
308  __asm {
309    push       esi
310    mov        eax, [esp + 4 + 4]    // src_ptr
311    mov        esi, [esp + 4 + 8]    // src_stride
312    mov        edx, [esp + 4 + 12]   // dst_ptr
313    mov        ecx, [esp + 4 + 16]   // dst_width
314    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
315    psrlw      xmm5, 8
316
317    align      16
318  wloop:
319    movdqu     xmm0, [eax]
320    movdqu     xmm1, [eax + 16]
321    movdqu     xmm2, [eax + esi]
322    movdqu     xmm3, [eax + esi + 16]
323    lea        eax,  [eax + 32]
324    pavgb      xmm0, xmm2            // average rows
325    pavgb      xmm1, xmm3
326
327    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
328    psrlw      xmm0, 8
329    movdqa     xmm3, xmm1
330    psrlw      xmm1, 8
331    pand       xmm2, xmm5
332    pand       xmm3, xmm5
333    pavgw      xmm0, xmm2
334    pavgw      xmm1, xmm3
335    packuswb   xmm0, xmm1
336
337    sub        ecx, 16
338    movdqu     [edx], xmm0
339    lea        edx, [edx + 16]
340    jg         wloop
341
342    pop        esi
343    ret
344  }
345}
346
347#define HAS_SCALEROWDOWN4_SSE2
348// Point samples 32 pixels to 8 pixels.
349// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
350__declspec(naked) __declspec(align(16))
351static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
352                               uint8* dst_ptr, int dst_width) {
353  __asm {
354    mov        eax, [esp + 4]        // src_ptr
355                                     // src_stride ignored
356    mov        edx, [esp + 12]       // dst_ptr
357    mov        ecx, [esp + 16]       // dst_width
358    pcmpeqb    xmm5, xmm5            // generate mask 0x000000ff
359    psrld      xmm5, 24
360
361    align      16
362  wloop:
363    movdqa     xmm0, [eax]
364    movdqa     xmm1, [eax + 16]
365    lea        eax,  [eax + 32]
366    pand       xmm0, xmm5
367    pand       xmm1, xmm5
368    packuswb   xmm0, xmm1
369    packuswb   xmm0, xmm0
370    sub        ecx, 8
371    movq       qword ptr [edx], xmm0
372    lea        edx, [edx + 8]
373    jg         wloop
374
375    ret
376  }
377}
378
379// Blends 32x4 rectangle to 8x1.
380// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
381__declspec(naked) __declspec(align(16))
382static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
383                                  uint8* dst_ptr, int dst_width) {
384  __asm {
385    push       esi
386    push       edi
387    mov        eax, [esp + 8 + 4]    // src_ptr
388    mov        esi, [esp + 8 + 8]    // src_stride
389    mov        edx, [esp + 8 + 12]   // dst_ptr
390    mov        ecx, [esp + 8 + 16]   // dst_width
391    lea        edi, [esi + esi * 2]  // src_stride * 3
392    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
393    psrlw      xmm7, 8
394
395    align      16
396  wloop:
397    movdqa     xmm0, [eax]
398    movdqa     xmm1, [eax + 16]
399    movdqa     xmm2, [eax + esi]
400    movdqa     xmm3, [eax + esi + 16]
401    pavgb      xmm0, xmm2            // average rows
402    pavgb      xmm1, xmm3
403    movdqa     xmm2, [eax + esi * 2]
404    movdqa     xmm3, [eax + esi * 2 + 16]
405    movdqa     xmm4, [eax + edi]
406    movdqa     xmm5, [eax + edi + 16]
407    lea        eax, [eax + 32]
408    pavgb      xmm2, xmm4
409    pavgb      xmm3, xmm5
410    pavgb      xmm0, xmm2
411    pavgb      xmm1, xmm3
412
413    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
414    psrlw      xmm0, 8
415    movdqa     xmm3, xmm1
416    psrlw      xmm1, 8
417    pand       xmm2, xmm7
418    pand       xmm3, xmm7
419    pavgw      xmm0, xmm2
420    pavgw      xmm1, xmm3
421    packuswb   xmm0, xmm1
422
423    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
424    psrlw      xmm0, 8
425    pand       xmm2, xmm7
426    pavgw      xmm0, xmm2
427    packuswb   xmm0, xmm0
428
429    sub        ecx, 8
430    movq       qword ptr [edx], xmm0
431    lea        edx, [edx + 8]
432    jg         wloop
433
434    pop        edi
435    pop        esi
436    ret
437  }
438}
439
440#define HAS_SCALEROWDOWN8_SSE2
441// Point samples 32 pixels to 4 pixels.
442// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
443__declspec(naked) __declspec(align(16))
444static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
445                               uint8* dst_ptr, int dst_width) {
446  __asm {
447    mov        eax, [esp + 4]        // src_ptr
448                                     // src_stride ignored
449    mov        edx, [esp + 12]       // dst_ptr
450    mov        ecx, [esp + 16]       // dst_width
451    pcmpeqb    xmm5, xmm5            // generate mask isolating 1 src 8 bytes
452    psrlq      xmm5, 56
453
454    align      16
455  wloop:
456    movdqa     xmm0, [eax]
457    movdqa     xmm1, [eax + 16]
458    lea        eax,  [eax + 32]
459    pand       xmm0, xmm5
460    pand       xmm1, xmm5
461    packuswb   xmm0, xmm1  // 32->16
462    packuswb   xmm0, xmm0  // 16->8
463    packuswb   xmm0, xmm0  // 8->4
464    sub        ecx, 4
465    movd       dword ptr [edx], xmm0
466    lea        edx, [edx + 4]
467    jg         wloop
468
469    ret
470  }
471}
472
473// Blends 32x8 rectangle to 4x1.
474// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
475__declspec(naked) __declspec(align(16))
476static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
477                                  uint8* dst_ptr, int dst_width) {
478  __asm {
479    push       esi
480    push       edi
481    push       ebp
482    mov        eax, [esp + 12 + 4]   // src_ptr
483    mov        esi, [esp + 12 + 8]   // src_stride
484    mov        edx, [esp + 12 + 12]  // dst_ptr
485    mov        ecx, [esp + 12 + 16]  // dst_width
486    lea        edi, [esi + esi * 2]  // src_stride * 3
487    pxor       xmm7, xmm7
488
489    align      16
490  wloop:
491    movdqa     xmm0, [eax]           // average 8 rows to 1
492    movdqa     xmm1, [eax + 16]
493    movdqa     xmm2, [eax + esi]
494    movdqa     xmm3, [eax + esi + 16]
495    pavgb      xmm0, xmm2
496    pavgb      xmm1, xmm3
497    movdqa     xmm2, [eax + esi * 2]
498    movdqa     xmm3, [eax + esi * 2 + 16]
499    movdqa     xmm4, [eax + edi]
500    movdqa     xmm5, [eax + edi + 16]
501    lea        ebp, [eax + esi * 4]
502    lea        eax, [eax + 32]
503    pavgb      xmm2, xmm4
504    pavgb      xmm3, xmm5
505    pavgb      xmm0, xmm2
506    pavgb      xmm1, xmm3
507
508    movdqa     xmm2, [ebp]
509    movdqa     xmm3, [ebp + 16]
510    movdqa     xmm4, [ebp + esi]
511    movdqa     xmm5, [ebp + esi + 16]
512    pavgb      xmm2, xmm4
513    pavgb      xmm3, xmm5
514    movdqa     xmm4, [ebp + esi * 2]
515    movdqa     xmm5, [ebp + esi * 2 + 16]
516    movdqa     xmm6, [ebp + edi]
517    pavgb      xmm4, xmm6
518    movdqa     xmm6, [ebp + edi + 16]
519    pavgb      xmm5, xmm6
520    pavgb      xmm2, xmm4
521    pavgb      xmm3, xmm5
522    pavgb      xmm0, xmm2
523    pavgb      xmm1, xmm3
524
525    psadbw     xmm0, xmm7            // average 32 pixels to 4
526    psadbw     xmm1, xmm7
527    pshufd     xmm0, xmm0, 0xd8      // x1x0 -> xx01
528    pshufd     xmm1, xmm1, 0x8d      // x3x2 -> 32xx
529    por        xmm0, xmm1            //      -> 3201
530    psrlw      xmm0, 3
531    packuswb   xmm0, xmm0
532    packuswb   xmm0, xmm0
533
534    sub        ecx, 4
535    movd       dword ptr [edx], xmm0
536    lea        edx, [edx + 4]
537    jg         wloop
538
539    pop        ebp
540    pop        edi
541    pop        esi
542    ret
543  }
544}
545
546#define HAS_SCALEROWDOWN34_SSSE3
547// Point samples 32 pixels to 24 pixels.
548// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
549// Then shuffled to do the scaling.
550
551// Note that movdqa+palign may be better than movdqu.
552// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
553__declspec(naked) __declspec(align(16))
554static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
555                                 uint8* dst_ptr, int dst_width) {
556  __asm {
557    mov        eax, [esp + 4]        // src_ptr
558                                     // src_stride ignored
559    mov        edx, [esp + 12]       // dst_ptr
560    mov        ecx, [esp + 16]       // dst_width
561    movdqa     xmm3, kShuf0
562    movdqa     xmm4, kShuf1
563    movdqa     xmm5, kShuf2
564
565    align      16
566  wloop:
567    movdqa     xmm0, [eax]
568    movdqa     xmm1, [eax + 16]
569    lea        eax,  [eax + 32]
570    movdqa     xmm2, xmm1
571    palignr    xmm1, xmm0, 8
572    pshufb     xmm0, xmm3
573    pshufb     xmm1, xmm4
574    pshufb     xmm2, xmm5
575    movq       qword ptr [edx], xmm0
576    movq       qword ptr [edx + 8], xmm1
577    movq       qword ptr [edx + 16], xmm2
578    lea        edx, [edx + 24]
579    sub        ecx, 24
580    jg         wloop
581
582    ret
583  }
584}
585
586// Blends 32x2 rectangle to 24x1
587// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
588// Then shuffled to do the scaling.
589
590// Register usage:
591// xmm0 src_row 0
592// xmm1 src_row 1
593// xmm2 shuf 0
594// xmm3 shuf 1
595// xmm4 shuf 2
596// xmm5 madd 0
597// xmm6 madd 1
598// xmm7 kRound34
599
600// Note that movdqa+palign may be better than movdqu.
601// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
602__declspec(naked) __declspec(align(16))
603static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
604                                       ptrdiff_t src_stride,
605                                       uint8* dst_ptr, int dst_width) {
606  __asm {
607    push       esi
608    mov        eax, [esp + 4 + 4]    // src_ptr
609    mov        esi, [esp + 4 + 8]    // src_stride
610    mov        edx, [esp + 4 + 12]   // dst_ptr
611    mov        ecx, [esp + 4 + 16]   // dst_width
612    movdqa     xmm2, kShuf01
613    movdqa     xmm3, kShuf11
614    movdqa     xmm4, kShuf21
615    movdqa     xmm5, kMadd01
616    movdqa     xmm6, kMadd11
617    movdqa     xmm7, kRound34
618
619    align      16
620  wloop:
621    movdqa     xmm0, [eax]           // pixels 0..7
622    movdqa     xmm1, [eax + esi]
623    pavgb      xmm0, xmm1
624    pshufb     xmm0, xmm2
625    pmaddubsw  xmm0, xmm5
626    paddsw     xmm0, xmm7
627    psrlw      xmm0, 2
628    packuswb   xmm0, xmm0
629    movq       qword ptr [edx], xmm0
630    movdqu     xmm0, [eax + 8]       // pixels 8..15
631    movdqu     xmm1, [eax + esi + 8]
632    pavgb      xmm0, xmm1
633    pshufb     xmm0, xmm3
634    pmaddubsw  xmm0, xmm6
635    paddsw     xmm0, xmm7
636    psrlw      xmm0, 2
637    packuswb   xmm0, xmm0
638    movq       qword ptr [edx + 8], xmm0
639    movdqa     xmm0, [eax + 16]      // pixels 16..23
640    movdqa     xmm1, [eax + esi + 16]
641    lea        eax, [eax + 32]
642    pavgb      xmm0, xmm1
643    pshufb     xmm0, xmm4
644    movdqa     xmm1, kMadd21
645    pmaddubsw  xmm0, xmm1
646    paddsw     xmm0, xmm7
647    psrlw      xmm0, 2
648    packuswb   xmm0, xmm0
649    sub        ecx, 24
650    movq       qword ptr [edx + 16], xmm0
651    lea        edx, [edx + 24]
652    jg         wloop
653
654    pop        esi
655    ret
656  }
657}
658
659// Note that movdqa+palign may be better than movdqu.
660// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
661__declspec(naked) __declspec(align(16))
662static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
663                                       ptrdiff_t src_stride,
664                                       uint8* dst_ptr, int dst_width) {
665  __asm {
666    push       esi
667    mov        eax, [esp + 4 + 4]    // src_ptr
668    mov        esi, [esp + 4 + 8]    // src_stride
669    mov        edx, [esp + 4 + 12]   // dst_ptr
670    mov        ecx, [esp + 4 + 16]   // dst_width
671    movdqa     xmm2, kShuf01
672    movdqa     xmm3, kShuf11
673    movdqa     xmm4, kShuf21
674    movdqa     xmm5, kMadd01
675    movdqa     xmm6, kMadd11
676    movdqa     xmm7, kRound34
677
678    align      16
679  wloop:
680    movdqa     xmm0, [eax]           // pixels 0..7
681    movdqa     xmm1, [eax + esi]
682    pavgb      xmm1, xmm0
683    pavgb      xmm0, xmm1
684    pshufb     xmm0, xmm2
685    pmaddubsw  xmm0, xmm5
686    paddsw     xmm0, xmm7
687    psrlw      xmm0, 2
688    packuswb   xmm0, xmm0
689    movq       qword ptr [edx], xmm0
690    movdqu     xmm0, [eax + 8]       // pixels 8..15
691    movdqu     xmm1, [eax + esi + 8]
692    pavgb      xmm1, xmm0
693    pavgb      xmm0, xmm1
694    pshufb     xmm0, xmm3
695    pmaddubsw  xmm0, xmm6
696    paddsw     xmm0, xmm7
697    psrlw      xmm0, 2
698    packuswb   xmm0, xmm0
699    movq       qword ptr [edx + 8], xmm0
700    movdqa     xmm0, [eax + 16]      // pixels 16..23
701    movdqa     xmm1, [eax + esi + 16]
702    lea        eax, [eax + 32]
703    pavgb      xmm1, xmm0
704    pavgb      xmm0, xmm1
705    pshufb     xmm0, xmm4
706    movdqa     xmm1, kMadd21
707    pmaddubsw  xmm0, xmm1
708    paddsw     xmm0, xmm7
709    psrlw      xmm0, 2
710    packuswb   xmm0, xmm0
711    sub        ecx, 24
712    movq       qword ptr [edx + 16], xmm0
713    lea        edx, [edx+24]
714    jg         wloop
715
716    pop        esi
717    ret
718  }
719}
720
721#define HAS_SCALEROWDOWN38_SSSE3
722// 3/8 point sampler
723
724// Scale 32 pixels to 12
725__declspec(naked) __declspec(align(16))
726static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
727                                 uint8* dst_ptr, int dst_width) {
728  __asm {
729    mov        eax, [esp + 4]        // src_ptr
730                                     // src_stride ignored
731    mov        edx, [esp + 12]       // dst_ptr
732    mov        ecx, [esp + 16]       // dst_width
733    movdqa     xmm4, kShuf38a
734    movdqa     xmm5, kShuf38b
735
736    align      16
737  xloop:
738    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
739    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
740    lea        eax, [eax + 32]
741    pshufb     xmm0, xmm4
742    pshufb     xmm1, xmm5
743    paddusb    xmm0, xmm1
744
745    sub        ecx, 12
746    movq       qword ptr [edx], xmm0 // write 12 pixels
747    movhlps    xmm1, xmm0
748    movd       [edx + 8], xmm1
749    lea        edx, [edx + 12]
750    jg         xloop
751
752    ret
753  }
754}
755
756// Scale 16x3 pixels to 6x1 with interpolation
757__declspec(naked) __declspec(align(16))
758static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
759                                       ptrdiff_t src_stride,
760                                       uint8* dst_ptr, int dst_width) {
761  __asm {
762    push       esi
763    mov        eax, [esp + 4 + 4]    // src_ptr
764    mov        esi, [esp + 4 + 8]    // src_stride
765    mov        edx, [esp + 4 + 12]   // dst_ptr
766    mov        ecx, [esp + 4 + 16]   // dst_width
767    movdqa     xmm2, kShufAc
768    movdqa     xmm3, kShufAc3
769    movdqa     xmm4, kScaleAc33
770    pxor       xmm5, xmm5
771
772    align      16
773  xloop:
774    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
775    movdqa     xmm6, [eax + esi]
776    movhlps    xmm1, xmm0
777    movhlps    xmm7, xmm6
778    punpcklbw  xmm0, xmm5
779    punpcklbw  xmm1, xmm5
780    punpcklbw  xmm6, xmm5
781    punpcklbw  xmm7, xmm5
782    paddusw    xmm0, xmm6
783    paddusw    xmm1, xmm7
784    movdqa     xmm6, [eax + esi * 2]
785    lea        eax, [eax + 16]
786    movhlps    xmm7, xmm6
787    punpcklbw  xmm6, xmm5
788    punpcklbw  xmm7, xmm5
789    paddusw    xmm0, xmm6
790    paddusw    xmm1, xmm7
791
792    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
793    psrldq     xmm0, 2
794    paddusw    xmm6, xmm0
795    psrldq     xmm0, 2
796    paddusw    xmm6, xmm0
797    pshufb     xmm6, xmm2
798
799    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
800    psrldq     xmm1, 2
801    paddusw    xmm7, xmm1
802    psrldq     xmm1, 2
803    paddusw    xmm7, xmm1
804    pshufb     xmm7, xmm3
805    paddusw    xmm6, xmm7
806
807    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
808    packuswb   xmm6, xmm6
809
810    sub        ecx, 6
811    movd       [edx], xmm6           // write 6 pixels
812    psrlq      xmm6, 16
813    movd       [edx + 2], xmm6
814    lea        edx, [edx + 6]
815    jg         xloop
816
817    pop        esi
818    ret
819  }
820}
821
822// Scale 16x2 pixels to 6x1 with interpolation
823__declspec(naked) __declspec(align(16))
824static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
825                                       ptrdiff_t src_stride,
826                                       uint8* dst_ptr, int dst_width) {
827  __asm {
828    push       esi
829    mov        eax, [esp + 4 + 4]    // src_ptr
830    mov        esi, [esp + 4 + 8]    // src_stride
831    mov        edx, [esp + 4 + 12]   // dst_ptr
832    mov        ecx, [esp + 4 + 16]   // dst_width
833    movdqa     xmm2, kShufAb0
834    movdqa     xmm3, kShufAb1
835    movdqa     xmm4, kShufAb2
836    movdqa     xmm5, kScaleAb2
837
838    align      16
839  xloop:
840    movdqa     xmm0, [eax]           // average 2 rows into xmm0
841    pavgb      xmm0, [eax + esi]
842    lea        eax, [eax + 16]
843
844    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
845    pshufb     xmm1, xmm2
846    movdqa     xmm6, xmm0
847    pshufb     xmm6, xmm3
848    paddusw    xmm1, xmm6
849    pshufb     xmm0, xmm4
850    paddusw    xmm1, xmm0
851
852    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
853    packuswb   xmm1, xmm1
854
855    sub        ecx, 6
856    movd       [edx], xmm1           // write 6 pixels
857    psrlq      xmm1, 16
858    movd       [edx + 2], xmm1
859    lea        edx, [edx + 6]
860    jg         xloop
861
862    pop        esi
863    ret
864  }
865}
866
867#define HAS_SCALEADDROWS_SSE2
868
869// Reads 16xN bytes and produces 16 shorts at a time.
870__declspec(naked) __declspec(align(16))
871static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
872                              uint16* dst_ptr, int src_width,
873                              int src_height) {
874  __asm {
875    push       esi
876    push       edi
877    push       ebx
878    push       ebp
879    mov        esi, [esp + 16 + 4]   // src_ptr
880    mov        edx, [esp + 16 + 8]   // src_stride
881    mov        edi, [esp + 16 + 12]  // dst_ptr
882    mov        ecx, [esp + 16 + 16]  // dst_width
883    mov        ebx, [esp + 16 + 20]  // height
884    pxor       xmm4, xmm4
885    dec        ebx
886
887    align      16
888  xloop:
889    // first row
890    movdqa     xmm0, [esi]
891    lea        eax, [esi + edx]
892    movdqa     xmm1, xmm0
893    punpcklbw  xmm0, xmm4
894    punpckhbw  xmm1, xmm4
895    lea        esi, [esi + 16]
896    mov        ebp, ebx
897    test       ebp, ebp
898    je         ydone
899
900    // sum remaining rows
901    align      16
902  yloop:
903    movdqa     xmm2, [eax]       // read 16 pixels
904    lea        eax, [eax + edx]  // advance to next row
905    movdqa     xmm3, xmm2
906    punpcklbw  xmm2, xmm4
907    punpckhbw  xmm3, xmm4
908    paddusw    xmm0, xmm2        // sum 16 words
909    paddusw    xmm1, xmm3
910    sub        ebp, 1
911    jg         yloop
912  ydone:
913    movdqa     [edi], xmm0
914    movdqa     [edi + 16], xmm1
915    lea        edi, [edi + 32]
916
917    sub        ecx, 16
918    jg         xloop
919
920    pop        ebp
921    pop        ebx
922    pop        edi
923    pop        esi
924    ret
925  }
926}
927
928#ifndef SSE2_DISABLED
929// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
930// Normal formula for bilinear interpolation is:
931//   source_y_fraction * row1 + (1 - source_y_fraction) row0
932// SSE2 version using the a single multiply of difference:
933//   source_y_fraction * (row1 - row0) + row0
934#define HAS_SCALEFILTERROWS_SSE2_DISABLED
935__declspec(naked) __declspec(align(16))
936static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
937                                 ptrdiff_t src_stride, int dst_width,
938                                 int source_y_fraction) {
939  __asm {
940    push       esi
941    push       edi
942    mov        edi, [esp + 8 + 4]   // dst_ptr
943    mov        esi, [esp + 8 + 8]   // src_ptr
944    mov        edx, [esp + 8 + 12]  // src_stride
945    mov        ecx, [esp + 8 + 16]  // dst_width
946    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
947    sub        edi, esi
948    cmp        eax, 0
949    je         xloop1
950    cmp        eax, 128
951    je         xloop2
952
953    movd       xmm5, eax            // xmm5 = y fraction
954    punpcklbw  xmm5, xmm5
955    punpcklwd  xmm5, xmm5
956    pshufd     xmm5, xmm5, 0
957    pxor       xmm4, xmm4
958
959    align      16
960  xloop:
961    movdqa     xmm0, [esi]  // row0
962    movdqa     xmm2, [esi + edx]  // row1
963    movdqa     xmm1, xmm0
964    movdqa     xmm3, xmm2
965    punpcklbw  xmm2, xmm4
966    punpckhbw  xmm3, xmm4
967    punpcklbw  xmm0, xmm4
968    punpckhbw  xmm1, xmm4
969    psubw      xmm2, xmm0  // row1 - row0
970    psubw      xmm3, xmm1
971    pmulhw     xmm2, xmm5  // scale diff
972    pmulhw     xmm3, xmm5
973    paddw      xmm0, xmm2  // sum rows
974    paddw      xmm1, xmm3
975    packuswb   xmm0, xmm1
976    sub        ecx, 16
977    movdqa     [esi + edi], xmm0
978    lea        esi, [esi + 16]
979    jg         xloop
980
981    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
982    pshufhw    xmm0, xmm0, 0xff
983    punpckhqdq xmm0, xmm0
984    movdqa     [esi + edi], xmm0
985    pop        edi
986    pop        esi
987    ret
988
989    align      16
990  xloop1:
991    movdqa     xmm0, [esi]
992    sub        ecx, 16
993    movdqa     [esi + edi], xmm0
994    lea        esi, [esi + 16]
995    jg         xloop1
996
997    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
998    pshufhw    xmm0, xmm0, 0xff
999    punpckhqdq xmm0, xmm0
1000    movdqa     [esi + edi], xmm0
1001    pop        edi
1002    pop        esi
1003    ret
1004
1005    align      16
1006  xloop2:
1007    movdqa     xmm0, [esi]
1008    pavgb      xmm0, [esi + edx]
1009    sub        ecx, 16
1010    movdqa     [esi + edi], xmm0
1011    lea        esi, [esi + 16]
1012    jg         xloop2
1013
1014    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
1015    pshufhw    xmm0, xmm0, 0xff
1016    punpckhqdq xmm0, xmm0
1017    movdqa     [esi + edi], xmm0
1018    pop        edi
1019    pop        esi
1020    ret
1021  }
1022}
1023#endif  // SSE2_DISABLED
1024// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
1025#define HAS_SCALEFILTERROWS_SSSE3
1026__declspec(naked) __declspec(align(16))
1027static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
1028                                  ptrdiff_t src_stride, int dst_width,
1029                                  int source_y_fraction) {
1030  __asm {
1031    push       esi
1032    push       edi
1033    mov        edi, [esp + 8 + 4]   // dst_ptr
1034    mov        esi, [esp + 8 + 8]   // src_ptr
1035    mov        edx, [esp + 8 + 12]  // src_stride
1036    mov        ecx, [esp + 8 + 16]  // dst_width
1037    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
1038    sub        edi, esi
1039    shr        eax, 1
1040    cmp        eax, 0
1041    je         xloop1
1042    cmp        eax, 64
1043    je         xloop2
1044    movd       xmm0, eax  // high fraction 0..127
1045    neg        eax
1046    add        eax, 128
1047    movd       xmm5, eax  // low fraction 128..1
1048    punpcklbw  xmm5, xmm0
1049    punpcklwd  xmm5, xmm5
1050    pshufd     xmm5, xmm5, 0
1051
1052    align      16
1053  xloop:
1054    movdqa     xmm0, [esi]
1055    movdqa     xmm2, [esi + edx]
1056    movdqa     xmm1, xmm0
1057    punpcklbw  xmm0, xmm2
1058    punpckhbw  xmm1, xmm2
1059    pmaddubsw  xmm0, xmm5
1060    pmaddubsw  xmm1, xmm5
1061    psrlw      xmm0, 7
1062    psrlw      xmm1, 7
1063    packuswb   xmm0, xmm1
1064    sub        ecx, 16
1065    movdqa     [esi + edi], xmm0
1066    lea        esi, [esi + 16]
1067    jg         xloop
1068
1069    punpckhbw  xmm0, xmm0           // duplicate last pixel for filtering
1070    pshufhw    xmm0, xmm0, 0xff
1071    punpckhqdq xmm0, xmm0
1072    movdqa     [esi + edi], xmm0
1073
1074    pop        edi
1075    pop        esi
1076    ret
1077
1078    align      16
1079  xloop1:
1080    movdqa     xmm0, [esi]
1081    sub        ecx, 16
1082    movdqa     [esi + edi], xmm0
1083    lea        esi, [esi + 16]
1084    jg         xloop1
1085
1086    punpckhbw  xmm0, xmm0
1087    pshufhw    xmm0, xmm0, 0xff
1088    punpckhqdq xmm0, xmm0
1089    movdqa     [esi + edi], xmm0
1090    pop        edi
1091    pop        esi
1092    ret
1093
1094    align      16
1095  xloop2:
1096    movdqa     xmm0, [esi]
1097    pavgb      xmm0, [esi + edx]
1098    sub        ecx, 16
1099    movdqa     [esi + edi], xmm0
1100    lea        esi, [esi + 16]
1101    jg         xloop2
1102
1103    punpckhbw  xmm0, xmm0
1104    pshufhw    xmm0, xmm0, 0xff
1105    punpckhqdq xmm0, xmm0
1106    movdqa     [esi + edi], xmm0
1107    pop        edi
1108    pop        esi
1109    ret
1110  }
1111}
1112
1113#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
1114
1115// GCC versions of row functions are verbatim conversions from Visual C.
1116// Generated using gcc disassembly on Visual C object file:
1117// objdump -D yuvscaler.obj >yuvscaler.txt
1118#define HAS_SCALEROWDOWN2_SSE2
1119static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1120                               uint8* dst_ptr, int dst_width) {
1121  asm volatile (
1122    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1123    "psrlw     $0x8,%%xmm5                     \n"
1124    ".p2align  4                               \n"
1125  "1:                                          \n"
1126    "movdqa    (%0),%%xmm0                     \n"
1127    "movdqa    0x10(%0),%%xmm1                 \n"
1128    "lea       0x20(%0),%0                     \n"
1129    "pand      %%xmm5,%%xmm0                   \n"
1130    "pand      %%xmm5,%%xmm1                   \n"
1131    "packuswb  %%xmm1,%%xmm0                   \n"
1132    "movdqa    %%xmm0,(%1)                     \n"
1133    "lea       0x10(%1),%1                     \n"
1134    "sub       $0x10,%2                        \n"
1135    "jg        1b                              \n"
1136  : "+r"(src_ptr),    // %0
1137    "+r"(dst_ptr),    // %1
1138    "+r"(dst_width)   // %2
1139  :
1140  : "memory", "cc"
1141#if defined(__SSE2__)
1142    , "xmm0", "xmm1", "xmm5"
1143#endif
1144  );
1145}
1146
1147void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1148                           uint8* dst_ptr, int dst_width) {
1149  asm volatile (
1150    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1151    "psrlw     $0x8,%%xmm5                     \n"
1152    ".p2align  4                               \n"
1153  "1:                                          \n"
1154    "movdqa    (%0),%%xmm0                     \n"
1155    "movdqa    0x10(%0),%%xmm1                 \n"
1156    "movdqa    (%0,%3,1),%%xmm2                \n"
1157    "movdqa    0x10(%0,%3,1),%%xmm3            \n"
1158    "lea       0x20(%0),%0                     \n"
1159    "pavgb     %%xmm2,%%xmm0                   \n"
1160    "pavgb     %%xmm3,%%xmm1                   \n"
1161    "movdqa    %%xmm0,%%xmm2                   \n"
1162    "psrlw     $0x8,%%xmm0                     \n"
1163    "movdqa    %%xmm1,%%xmm3                   \n"
1164    "psrlw     $0x8,%%xmm1                     \n"
1165    "pand      %%xmm5,%%xmm2                   \n"
1166    "pand      %%xmm5,%%xmm3                   \n"
1167    "pavgw     %%xmm2,%%xmm0                   \n"
1168    "pavgw     %%xmm3,%%xmm1                   \n"
1169    "packuswb  %%xmm1,%%xmm0                   \n"
1170    "movdqa    %%xmm0,(%1)                     \n"
1171    "lea       0x10(%1),%1                     \n"
1172    "sub       $0x10,%2                        \n"
1173    "jg        1b                              \n"
1174  : "+r"(src_ptr),    // %0
1175    "+r"(dst_ptr),    // %1
1176    "+r"(dst_width)   // %2
1177  : "r"(static_cast<intptr_t>(src_stride))   // %3
1178  : "memory", "cc"
1179#if defined(__SSE2__)
1180    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1181#endif
1182  );
1183}
1184static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
1185                                         ptrdiff_t src_stride,
1186                                         uint8* dst_ptr, int dst_width) {
1187  asm volatile (
1188    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1189    "psrlw     $0x8,%%xmm5                     \n"
1190    ".p2align  4                               \n"
1191  "1:                                          \n"
1192    "movdqu    (%0),%%xmm0                     \n"
1193    "movdqu    0x10(%0),%%xmm1                 \n"
1194    "lea       0x20(%0),%0                     \n"
1195    "pand      %%xmm5,%%xmm0                   \n"
1196    "pand      %%xmm5,%%xmm1                   \n"
1197    "packuswb  %%xmm1,%%xmm0                   \n"
1198    "movdqu    %%xmm0,(%1)                     \n"
1199    "lea       0x10(%1),%1                     \n"
1200    "sub       $0x10,%2                        \n"
1201    "jg        1b                              \n"
1202  : "+r"(src_ptr),    // %0
1203    "+r"(dst_ptr),    // %1
1204    "+r"(dst_width)   // %2
1205  :
1206  : "memory", "cc"
1207#if defined(__SSE2__)
1208    , "xmm0", "xmm1", "xmm5"
1209#endif
1210  );
1211}
1212
1213static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
1214                                            ptrdiff_t src_stride,
1215                                            uint8* dst_ptr, int dst_width) {
1216  asm volatile (
1217    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1218    "psrlw     $0x8,%%xmm5                     \n"
1219    ".p2align  4                               \n"
1220  "1:                                          \n"
1221    "movdqu    (%0),%%xmm0                     \n"
1222    "movdqu    0x10(%0),%%xmm1                 \n"
1223    "movdqu    (%0,%3,1),%%xmm2                \n"
1224    "movdqu    0x10(%0,%3,1),%%xmm3            \n"
1225    "lea       0x20(%0),%0                     \n"
1226    "pavgb     %%xmm2,%%xmm0                   \n"
1227    "pavgb     %%xmm3,%%xmm1                   \n"
1228    "movdqa    %%xmm0,%%xmm2                   \n"
1229    "psrlw     $0x8,%%xmm0                     \n"
1230    "movdqa    %%xmm1,%%xmm3                   \n"
1231    "psrlw     $0x8,%%xmm1                     \n"
1232    "pand      %%xmm5,%%xmm2                   \n"
1233    "pand      %%xmm5,%%xmm3                   \n"
1234    "pavgw     %%xmm2,%%xmm0                   \n"
1235    "pavgw     %%xmm3,%%xmm1                   \n"
1236    "packuswb  %%xmm1,%%xmm0                   \n"
1237    "movdqu    %%xmm0,(%1)                     \n"
1238    "lea       0x10(%1),%1                     \n"
1239    "sub       $0x10,%2                        \n"
1240    "jg        1b                              \n"
1241  : "+r"(src_ptr),    // %0
1242    "+r"(dst_ptr),    // %1
1243    "+r"(dst_width)   // %2
1244  : "r"(static_cast<intptr_t>(src_stride))   // %3
1245  : "memory", "cc"
1246#if defined(__SSE2__)
1247    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
1248#endif
1249  );
1250}
1251
1252#define HAS_SCALEROWDOWN4_SSE2
1253static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1254                               uint8* dst_ptr, int dst_width) {
1255  asm volatile (
1256    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1257    "psrld     $0x18,%%xmm5                    \n"
1258    ".p2align  4                               \n"
1259  "1:                                          \n"
1260    "movdqa    (%0),%%xmm0                     \n"
1261    "movdqa    0x10(%0),%%xmm1                 \n"
1262    "lea       0x20(%0),%0                     \n"
1263    "pand      %%xmm5,%%xmm0                   \n"
1264    "pand      %%xmm5,%%xmm1                   \n"
1265    "packuswb  %%xmm1,%%xmm0                   \n"
1266    "packuswb  %%xmm0,%%xmm0                   \n"
1267    "movq      %%xmm0,(%1)                     \n"
1268    "lea       0x8(%1),%1                      \n"
1269    "sub       $0x8,%2                         \n"
1270    "jg        1b                              \n"
1271  : "+r"(src_ptr),    // %0
1272    "+r"(dst_ptr),    // %1
1273    "+r"(dst_width)   // %2
1274  :
1275  : "memory", "cc"
1276#if defined(__SSE2__)
1277    , "xmm0", "xmm1", "xmm5"
1278#endif
1279  );
1280}
1281
1282static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1283                                  uint8* dst_ptr, int dst_width) {
1284  intptr_t stridex3 = 0;
1285  asm volatile (
1286    "pcmpeqb   %%xmm7,%%xmm7                   \n"
1287    "psrlw     $0x8,%%xmm7                     \n"
1288    "lea       (%4,%4,2),%3                    \n"
1289    ".p2align  4                               \n"
1290  "1:                                          \n"
1291    "movdqa    (%0),%%xmm0                     \n"
1292    "movdqa    0x10(%0),%%xmm1                 \n"
1293    "movdqa    (%0,%4,1),%%xmm2                \n"
1294    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
1295    "pavgb     %%xmm2,%%xmm0                   \n"
1296    "pavgb     %%xmm3,%%xmm1                   \n"
1297    "movdqa    (%0,%4,2),%%xmm2                \n"
1298    "movdqa    0x10(%0,%4,2),%%xmm3            \n"
1299    "movdqa    (%0,%3,1),%%xmm4                \n"
1300    "movdqa    0x10(%0,%3,1),%%xmm5            \n"
1301    "lea       0x20(%0),%0                     \n"
1302    "pavgb     %%xmm4,%%xmm2                   \n"
1303    "pavgb     %%xmm2,%%xmm0                   \n"
1304    "pavgb     %%xmm5,%%xmm3                   \n"
1305    "pavgb     %%xmm3,%%xmm1                   \n"
1306    "movdqa    %%xmm0,%%xmm2                   \n"
1307    "psrlw     $0x8,%%xmm0                     \n"
1308    "movdqa    %%xmm1,%%xmm3                   \n"
1309    "psrlw     $0x8,%%xmm1                     \n"
1310    "pand      %%xmm7,%%xmm2                   \n"
1311    "pand      %%xmm7,%%xmm3                   \n"
1312    "pavgw     %%xmm2,%%xmm0                   \n"
1313    "pavgw     %%xmm3,%%xmm1                   \n"
1314    "packuswb  %%xmm1,%%xmm0                   \n"
1315    "movdqa    %%xmm0,%%xmm2                   \n"
1316    "psrlw     $0x8,%%xmm0                     \n"
1317    "pand      %%xmm7,%%xmm2                   \n"
1318    "pavgw     %%xmm2,%%xmm0                   \n"
1319    "packuswb  %%xmm0,%%xmm0                   \n"
1320    "movq      %%xmm0,(%1)                     \n"
1321    "lea       0x8(%1),%1                      \n"
1322    "sub       $0x8,%2                         \n"
1323    "jg        1b                              \n"
1324  : "+r"(src_ptr),     // %0
1325    "+r"(dst_ptr),     // %1
1326    "+r"(dst_width),   // %2
1327    "+r"(stridex3)     // %3
1328  : "r"(static_cast<intptr_t>(src_stride))    // %4
1329  : "memory", "cc"
1330#if defined(__SSE2__)
1331    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
1332#endif
1333  );
1334}
1335
1336#define HAS_SCALEROWDOWN8_SSE2
1337static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1338                               uint8* dst_ptr, int dst_width) {
1339  asm volatile (
1340    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1341    "psrlq     $0x38,%%xmm5                    \n"
1342    ".p2align  4                               \n"
1343  "1:                                          \n"
1344    "movdqa    (%0),%%xmm0                     \n"
1345    "movdqa    0x10(%0),%%xmm1                 \n"
1346    "lea       0x20(%0),%0                     \n"
1347    "pand      %%xmm5,%%xmm0                   \n"
1348    "pand      %%xmm5,%%xmm1                   \n"
1349    "packuswb  %%xmm1,%%xmm0                   \n"
1350    "packuswb  %%xmm0,%%xmm0                   \n"
1351    "packuswb  %%xmm0,%%xmm0                   \n"
1352    "movd      %%xmm0,(%1)                     \n"
1353    "lea       0x4(%1),%1                      \n"
1354    "sub       $0x4,%2                         \n"
1355    "jg        1b                              \n"
1356  : "+r"(src_ptr),    // %0
1357    "+r"(dst_ptr),    // %1
1358    "+r"(dst_width)   // %2
1359  :
1360  : "memory", "cc"
1361#if defined(__SSE2__)
1362    , "xmm0", "xmm1", "xmm5"
1363#endif
1364  );
1365}
1366
1367static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1368                                  uint8* dst_ptr, int dst_width) {
1369  intptr_t stridex3 = 0;
1370  intptr_t row4 = 0;
1371  asm volatile (
1372    "lea       (%5,%5,2),%3                    \n"
1373    "pxor      %%xmm7,%%xmm7                   \n"
1374    ".p2align  4                               \n"
1375  "1:                                          \n"
1376    "movdqa    (%0),%%xmm0                     \n"
1377    "movdqa    0x10(%0),%%xmm1                 \n"
1378    "movdqa    (%0,%5,1),%%xmm2                \n"
1379    "movdqa    0x10(%0,%5,1),%%xmm3            \n"
1380    "pavgb     %%xmm2,%%xmm0                   \n"
1381    "pavgb     %%xmm3,%%xmm1                   \n"
1382    "movdqa    (%0,%5,2),%%xmm2                \n"
1383    "movdqa    0x10(%0,%5,2),%%xmm3            \n"
1384    "movdqa    (%0,%3,1),%%xmm4                \n"
1385    "movdqa    0x10(%0,%3,1),%%xmm5            \n"
1386    "lea       (%0,%5,4),%4                    \n"
1387    "lea       0x20(%0),%0                     \n"
1388    "pavgb     %%xmm4,%%xmm2                   \n"
1389    "pavgb     %%xmm5,%%xmm3                   \n"
1390    "pavgb     %%xmm2,%%xmm0                   \n"
1391    "pavgb     %%xmm3,%%xmm1                   \n"
1392    "movdqa    0x0(%4),%%xmm2                  \n"
1393    "movdqa    0x10(%4),%%xmm3                 \n"
1394    "movdqa    0x0(%4,%5,1),%%xmm4             \n"
1395    "movdqa    0x10(%4,%5,1),%%xmm5            \n"
1396    "pavgb     %%xmm4,%%xmm2                   \n"
1397    "pavgb     %%xmm5,%%xmm3                   \n"
1398    "movdqa    0x0(%4,%5,2),%%xmm4             \n"
1399    "movdqa    0x10(%4,%5,2),%%xmm5            \n"
1400    "movdqa    0x0(%4,%3,1),%%xmm6             \n"
1401    "pavgb     %%xmm6,%%xmm4                   \n"
1402    "movdqa    0x10(%4,%3,1),%%xmm6            \n"
1403    "pavgb     %%xmm6,%%xmm5                   \n"
1404    "pavgb     %%xmm4,%%xmm2                   \n"
1405    "pavgb     %%xmm5,%%xmm3                   \n"
1406    "pavgb     %%xmm2,%%xmm0                   \n"
1407    "pavgb     %%xmm3,%%xmm1                   \n"
1408    "psadbw    %%xmm7,%%xmm0                   \n"
1409    "psadbw    %%xmm7,%%xmm1                   \n"
1410    "pshufd    $0xd8,%%xmm0,%%xmm0             \n"
1411    "pshufd    $0x8d,%%xmm1,%%xmm1             \n"
1412    "por       %%xmm1,%%xmm0                   \n"
1413    "psrlw     $0x3,%%xmm0                     \n"
1414    "packuswb  %%xmm0,%%xmm0                   \n"
1415    "packuswb  %%xmm0,%%xmm0                   \n"
1416    "movd      %%xmm0,(%1)                     \n"
1417    "lea       0x4(%1),%1                      \n"
1418    "sub       $0x4,%2                         \n"
1419    "jg        1b                              \n"
1420  : "+r"(src_ptr),     // %0
1421    "+r"(dst_ptr),     // %1
1422    "+rm"(dst_width),  // %2
1423    "+r"(stridex3),    // %3
1424    "+r"(row4)         // %4
1425  : "r"(static_cast<intptr_t>(src_stride))  // %5
1426  : "memory", "cc"
1427#if defined(__SSE2__)
1428    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1429#endif
1430  );
1431}
1432
1433#define HAS_SCALEROWDOWN34_SSSE3
1434static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
1435                                 uint8* dst_ptr, int dst_width) {
1436  asm volatile (
1437    "movdqa    %0,%%xmm3                       \n"
1438    "movdqa    %1,%%xmm4                       \n"
1439    "movdqa    %2,%%xmm5                       \n"
1440  :
1441  : "m"(kShuf0),  // %0
1442    "m"(kShuf1),  // %1
1443    "m"(kShuf2)   // %2
1444  );
1445  asm volatile (
1446    ".p2align  4                               \n"
1447  "1:                                          \n"
1448    "movdqa    (%0),%%xmm0                     \n"
1449    "movdqa    0x10(%0),%%xmm2                 \n"
1450    "lea       0x20(%0),%0                     \n"
1451    "movdqa    %%xmm2,%%xmm1                   \n"
1452    "palignr   $0x8,%%xmm0,%%xmm1              \n"
1453    "pshufb    %%xmm3,%%xmm0                   \n"
1454    "pshufb    %%xmm4,%%xmm1                   \n"
1455    "pshufb    %%xmm5,%%xmm2                   \n"
1456    "movq      %%xmm0,(%1)                     \n"
1457    "movq      %%xmm1,0x8(%1)                  \n"
1458    "movq      %%xmm2,0x10(%1)                 \n"
1459    "lea       0x18(%1),%1                     \n"
1460    "sub       $0x18,%2                        \n"
1461    "jg        1b                              \n"
1462  : "+r"(src_ptr),   // %0
1463    "+r"(dst_ptr),   // %1
1464    "+r"(dst_width)  // %2
1465  :
1466  : "memory", "cc"
1467#if defined(__SSE2__)
1468    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1469#endif
1470  );
1471}
1472
1473static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr,
1474                                       ptrdiff_t src_stride,
1475                                       uint8* dst_ptr, int dst_width) {
1476  asm volatile (
1477    "movdqa    %0,%%xmm2                       \n"  // kShuf01
1478    "movdqa    %1,%%xmm3                       \n"  // kShuf11
1479    "movdqa    %2,%%xmm4                       \n"  // kShuf21
1480  :
1481  : "m"(kShuf01),  // %0
1482    "m"(kShuf11),  // %1
1483    "m"(kShuf21)   // %2
1484  );
1485  asm volatile (
1486    "movdqa    %0,%%xmm5                       \n"  // kMadd01
1487    "movdqa    %1,%%xmm0                       \n"  // kMadd11
1488    "movdqa    %2,%%xmm1                       \n"  // kRound34
1489  :
1490  : "m"(kMadd01),  // %0
1491    "m"(kMadd11),  // %1
1492    "m"(kRound34)  // %2
1493  );
1494  asm volatile (
1495    ".p2align  4                               \n"
1496  "1:                                          \n"
1497    "movdqa    (%0),%%xmm6                     \n"
1498    "movdqa    (%0,%3),%%xmm7                  \n"
1499    "pavgb     %%xmm7,%%xmm6                   \n"
1500    "pshufb    %%xmm2,%%xmm6                   \n"
1501    "pmaddubsw %%xmm5,%%xmm6                   \n"
1502    "paddsw    %%xmm1,%%xmm6                   \n"
1503    "psrlw     $0x2,%%xmm6                     \n"
1504    "packuswb  %%xmm6,%%xmm6                   \n"
1505    "movq      %%xmm6,(%1)                     \n"
1506    "movdqu    0x8(%0),%%xmm6                  \n"
1507    "movdqu    0x8(%0,%3),%%xmm7               \n"
1508    "pavgb     %%xmm7,%%xmm6                   \n"
1509    "pshufb    %%xmm3,%%xmm6                   \n"
1510    "pmaddubsw %%xmm0,%%xmm6                   \n"
1511    "paddsw    %%xmm1,%%xmm6                   \n"
1512    "psrlw     $0x2,%%xmm6                     \n"
1513    "packuswb  %%xmm6,%%xmm6                   \n"
1514    "movq      %%xmm6,0x8(%1)                  \n"
1515    "movdqa    0x10(%0),%%xmm6                 \n"
1516    "movdqa    0x10(%0,%3),%%xmm7              \n"
1517    "lea       0x20(%0),%0                     \n"
1518    "pavgb     %%xmm7,%%xmm6                   \n"
1519    "pshufb    %%xmm4,%%xmm6                   \n"
1520    "pmaddubsw %4,%%xmm6                       \n"
1521    "paddsw    %%xmm1,%%xmm6                   \n"
1522    "psrlw     $0x2,%%xmm6                     \n"
1523    "packuswb  %%xmm6,%%xmm6                   \n"
1524    "movq      %%xmm6,0x10(%1)                 \n"
1525    "lea       0x18(%1),%1                     \n"
1526    "sub       $0x18,%2                        \n"
1527    "jg        1b                              \n"
1528  : "+r"(src_ptr),   // %0
1529    "+r"(dst_ptr),   // %1
1530    "+r"(dst_width)  // %2
1531  : "r"(static_cast<intptr_t>(src_stride)),  // %3
1532    "m"(kMadd21)     // %4
1533  : "memory", "cc"
1534#if defined(__SSE2__)
1535    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1536#endif
1537  );
1538}
1539
1540static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr,
1541                                       ptrdiff_t src_stride,
1542                                       uint8* dst_ptr, int dst_width) {
1543  asm volatile (
1544    "movdqa    %0,%%xmm2                       \n"  // kShuf01
1545    "movdqa    %1,%%xmm3                       \n"  // kShuf11
1546    "movdqa    %2,%%xmm4                       \n"  // kShuf21
1547  :
1548  : "m"(kShuf01),  // %0
1549    "m"(kShuf11),  // %1
1550    "m"(kShuf21)   // %2
1551  );
1552  asm volatile (
1553    "movdqa    %0,%%xmm5                       \n"  // kMadd01
1554    "movdqa    %1,%%xmm0                       \n"  // kMadd11
1555    "movdqa    %2,%%xmm1                       \n"  // kRound34
1556  :
1557  : "m"(kMadd01),  // %0
1558    "m"(kMadd11),  // %1
1559    "m"(kRound34)  // %2
1560  );
1561
1562  asm volatile (
1563    ".p2align  4                               \n"
1564  "1:                                          \n"
1565    "movdqa    (%0),%%xmm6                     \n"
1566    "movdqa    (%0,%3,1),%%xmm7                \n"
1567    "pavgb     %%xmm6,%%xmm7                   \n"
1568    "pavgb     %%xmm7,%%xmm6                   \n"
1569    "pshufb    %%xmm2,%%xmm6                   \n"
1570    "pmaddubsw %%xmm5,%%xmm6                   \n"
1571    "paddsw    %%xmm1,%%xmm6                   \n"
1572    "psrlw     $0x2,%%xmm6                     \n"
1573    "packuswb  %%xmm6,%%xmm6                   \n"
1574    "movq      %%xmm6,(%1)                     \n"
1575    "movdqu    0x8(%0),%%xmm6                  \n"
1576    "movdqu    0x8(%0,%3,1),%%xmm7             \n"
1577    "pavgb     %%xmm6,%%xmm7                   \n"
1578    "pavgb     %%xmm7,%%xmm6                   \n"
1579    "pshufb    %%xmm3,%%xmm6                   \n"
1580    "pmaddubsw %%xmm0,%%xmm6                   \n"
1581    "paddsw    %%xmm1,%%xmm6                   \n"
1582    "psrlw     $0x2,%%xmm6                     \n"
1583    "packuswb  %%xmm6,%%xmm6                   \n"
1584    "movq      %%xmm6,0x8(%1)                  \n"
1585    "movdqa    0x10(%0),%%xmm6                 \n"
1586    "movdqa    0x10(%0,%3,1),%%xmm7            \n"
1587    "lea       0x20(%0),%0                     \n"
1588    "pavgb     %%xmm6,%%xmm7                   \n"
1589    "pavgb     %%xmm7,%%xmm6                   \n"
1590    "pshufb    %%xmm4,%%xmm6                   \n"
1591    "pmaddubsw %4,%%xmm6                       \n"
1592    "paddsw    %%xmm1,%%xmm6                   \n"
1593    "psrlw     $0x2,%%xmm6                     \n"
1594    "packuswb  %%xmm6,%%xmm6                   \n"
1595    "movq      %%xmm6,0x10(%1)                 \n"
1596    "lea       0x18(%1),%1                     \n"
1597    "sub       $0x18,%2                        \n"
1598    "jg        1b                              \n"
1599    : "+r"(src_ptr),   // %0
1600      "+r"(dst_ptr),   // %1
1601      "+r"(dst_width)  // %2
1602    : "r"(static_cast<intptr_t>(src_stride)),  // %3
1603      "m"(kMadd21)     // %4
1604    : "memory", "cc"
1605#if defined(__SSE2__)
1606    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1607#endif
1608  );
1609}
1610
1611#define HAS_SCALEROWDOWN38_SSSE3
1612static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
1613                                 uint8* dst_ptr, int dst_width) {
1614  asm volatile (
1615    "movdqa    %3,%%xmm4                       \n"
1616    "movdqa    %4,%%xmm5                       \n"
1617    ".p2align  4                               \n"
1618  "1:                                          \n"
1619    "movdqa    (%0),%%xmm0                     \n"
1620    "movdqa    0x10(%0),%%xmm1                 \n"
1621    "lea       0x20(%0),%0                     \n"
1622    "pshufb    %%xmm4,%%xmm0                   \n"
1623    "pshufb    %%xmm5,%%xmm1                   \n"
1624    "paddusb   %%xmm1,%%xmm0                   \n"
1625    "movq      %%xmm0,(%1)                     \n"
1626    "movhlps   %%xmm0,%%xmm1                   \n"
1627    "movd      %%xmm1,0x8(%1)                  \n"
1628    "lea       0xc(%1),%1                      \n"
1629    "sub       $0xc,%2                         \n"
1630    "jg        1b                              \n"
1631  : "+r"(src_ptr),   // %0
1632    "+r"(dst_ptr),   // %1
1633    "+r"(dst_width)  // %2
1634  : "m"(kShuf38a),   // %3
1635    "m"(kShuf38b)    // %4
1636  : "memory", "cc"
1637#if defined(__SSE2__)
1638      , "xmm0", "xmm1", "xmm4", "xmm5"
1639#endif
1640  );
1641}
1642
1643static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr,
1644                                       ptrdiff_t src_stride,
1645                                       uint8* dst_ptr, int dst_width) {
1646  asm volatile (
1647    "movdqa    %0,%%xmm2                       \n"
1648    "movdqa    %1,%%xmm3                       \n"
1649    "movdqa    %2,%%xmm4                       \n"
1650    "movdqa    %3,%%xmm5                       \n"
1651  :
1652  : "m"(kShufAb0),   // %0
1653    "m"(kShufAb1),   // %1
1654    "m"(kShufAb2),   // %2
1655    "m"(kScaleAb2)   // %3
1656  );
1657  asm volatile (
1658    ".p2align  4                               \n"
1659  "1:                                          \n"
1660    "movdqa    (%0),%%xmm0                     \n"
1661    "pavgb     (%0,%3,1),%%xmm0                \n"
1662    "lea       0x10(%0),%0                     \n"
1663    "movdqa    %%xmm0,%%xmm1                   \n"
1664    "pshufb    %%xmm2,%%xmm1                   \n"
1665    "movdqa    %%xmm0,%%xmm6                   \n"
1666    "pshufb    %%xmm3,%%xmm6                   \n"
1667    "paddusw   %%xmm6,%%xmm1                   \n"
1668    "pshufb    %%xmm4,%%xmm0                   \n"
1669    "paddusw   %%xmm0,%%xmm1                   \n"
1670    "pmulhuw   %%xmm5,%%xmm1                   \n"
1671    "packuswb  %%xmm1,%%xmm1                   \n"
1672    "sub       $0x6,%2                         \n"
1673    "movd      %%xmm1,(%1)                     \n"
1674    "psrlq     $0x10,%%xmm1                    \n"
1675    "movd      %%xmm1,0x2(%1)                  \n"
1676    "lea       0x6(%1),%1                      \n"
1677    "jg        1b                              \n"
1678  : "+r"(src_ptr),     // %0
1679    "+r"(dst_ptr),     // %1
1680    "+r"(dst_width)    // %2
1681  : "r"(static_cast<intptr_t>(src_stride))  // %3
1682  : "memory", "cc"
1683#if defined(__SSE2__)
1684    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1685#endif
1686  );
1687}
1688
1689static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr,
1690                                       ptrdiff_t src_stride,
1691                                       uint8* dst_ptr, int dst_width) {
1692  asm volatile (
1693    "movdqa    %0,%%xmm2                       \n"
1694    "movdqa    %1,%%xmm3                       \n"
1695    "movdqa    %2,%%xmm4                       \n"
1696    "pxor      %%xmm5,%%xmm5                   \n"
1697  :
1698  : "m"(kShufAc),    // %0
1699    "m"(kShufAc3),   // %1
1700    "m"(kScaleAc33)  // %2
1701  );
1702  asm volatile (
1703    ".p2align  4                               \n"
1704  "1:                                          \n"
1705    "movdqa    (%0),%%xmm0                     \n"
1706    "movdqa    (%0,%3,1),%%xmm6                \n"
1707    "movhlps   %%xmm0,%%xmm1                   \n"
1708    "movhlps   %%xmm6,%%xmm7                   \n"
1709    "punpcklbw %%xmm5,%%xmm0                   \n"
1710    "punpcklbw %%xmm5,%%xmm1                   \n"
1711    "punpcklbw %%xmm5,%%xmm6                   \n"
1712    "punpcklbw %%xmm5,%%xmm7                   \n"
1713    "paddusw   %%xmm6,%%xmm0                   \n"
1714    "paddusw   %%xmm7,%%xmm1                   \n"
1715    "movdqa    (%0,%3,2),%%xmm6                \n"
1716    "lea       0x10(%0),%0                     \n"
1717    "movhlps   %%xmm6,%%xmm7                   \n"
1718    "punpcklbw %%xmm5,%%xmm6                   \n"
1719    "punpcklbw %%xmm5,%%xmm7                   \n"
1720    "paddusw   %%xmm6,%%xmm0                   \n"
1721    "paddusw   %%xmm7,%%xmm1                   \n"
1722    "movdqa    %%xmm0,%%xmm6                   \n"
1723    "psrldq    $0x2,%%xmm0                     \n"
1724    "paddusw   %%xmm0,%%xmm6                   \n"
1725    "psrldq    $0x2,%%xmm0                     \n"
1726    "paddusw   %%xmm0,%%xmm6                   \n"
1727    "pshufb    %%xmm2,%%xmm6                   \n"
1728    "movdqa    %%xmm1,%%xmm7                   \n"
1729    "psrldq    $0x2,%%xmm1                     \n"
1730    "paddusw   %%xmm1,%%xmm7                   \n"
1731    "psrldq    $0x2,%%xmm1                     \n"
1732    "paddusw   %%xmm1,%%xmm7                   \n"
1733    "pshufb    %%xmm3,%%xmm7                   \n"
1734    "paddusw   %%xmm7,%%xmm6                   \n"
1735    "pmulhuw   %%xmm4,%%xmm6                   \n"
1736    "packuswb  %%xmm6,%%xmm6                   \n"
1737    "sub       $0x6,%2                         \n"
1738    "movd      %%xmm6,(%1)                     \n"
1739    "psrlq     $0x10,%%xmm6                    \n"
1740    "movd      %%xmm6,0x2(%1)                  \n"
1741    "lea       0x6(%1),%1                      \n"
1742    "jg        1b                              \n"
1743  : "+r"(src_ptr),    // %0
1744    "+r"(dst_ptr),    // %1
1745    "+r"(dst_width)   // %2
1746  : "r"(static_cast<intptr_t>(src_stride))   // %3
1747  : "memory", "cc"
1748#if defined(__SSE2__)
1749    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1750#endif
1751  );
1752}
1753
1754#define HAS_SCALEADDROWS_SSE2
1755static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
1756                              uint16* dst_ptr, int src_width, int src_height) {
1757  int tmp_height = 0;
1758  intptr_t tmp_src = 0;
1759  asm volatile (
1760    "pxor      %%xmm4,%%xmm4                   \n"
1761    "sub       $0x1,%5                         \n"
1762    ".p2align  4                               \n"
1763  "1:                                          \n"
1764    "movdqa    (%0),%%xmm0                     \n"
1765    "mov       %0,%3                           \n"
1766    "add       %6,%0                           \n"
1767    "movdqa    %%xmm0,%%xmm1                   \n"
1768    "punpcklbw %%xmm4,%%xmm0                   \n"
1769    "punpckhbw %%xmm4,%%xmm1                   \n"
1770    "mov       %5,%2                           \n"
1771    "test      %2,%2                           \n"
1772    "je        3f                              \n"
1773  "2:                                          \n"
1774    "movdqa    (%0),%%xmm2                     \n"
1775    "add       %6,%0                           \n"
1776    "movdqa    %%xmm2,%%xmm3                   \n"
1777    "punpcklbw %%xmm4,%%xmm2                   \n"
1778    "punpckhbw %%xmm4,%%xmm3                   \n"
1779    "paddusw   %%xmm2,%%xmm0                   \n"
1780    "paddusw   %%xmm3,%%xmm1                   \n"
1781    "sub       $0x1,%2                         \n"
1782    "jg        2b                              \n"
1783  "3:                                          \n"
1784    "movdqa    %%xmm0,(%1)                     \n"
1785    "movdqa    %%xmm1,0x10(%1)                 \n"
1786    "lea       0x10(%3),%0                     \n"
1787    "lea       0x20(%1),%1                     \n"
1788    "sub       $0x10,%4                        \n"
1789    "jg        1b                              \n"
1790  : "+r"(src_ptr),     // %0
1791    "+r"(dst_ptr),     // %1
1792    "+r"(tmp_height),  // %2
1793    "+r"(tmp_src),     // %3
1794    "+r"(src_width),   // %4
1795    "+rm"(src_height)  // %5
1796  : "rm"(static_cast<intptr_t>(src_stride))  // %6
1797  : "memory", "cc"
1798#if defined(__SSE2__)
1799    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1800#endif
1801  );
1802}
1803
1804#ifndef SSE2_DISABLED
1805// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version
1806#define HAS_SCALEFILTERROWS_SSE2_DISABLED
1807static void ScaleFilterRows_SSE2(uint8* dst_ptr,
1808                                 const uint8* src_ptr, ptrdiff_t src_stride,
1809                                 int dst_width, int source_y_fraction) {
1810  asm volatile (
1811    "sub       %1,%0                           \n"
1812    "cmp       $0x0,%3                         \n"
1813    "je        2f                              \n"
1814    "cmp       $0x80,%3                        \n"
1815    "je        3f                              \n"
1816    "movd      %3,%%xmm5                       \n"
1817    "punpcklbw %%xmm5,%%xmm5                   \n"
1818    "punpcklwd %%xmm5,%%xmm5                   \n"
1819    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
1820    "pxor      %%xmm4,%%xmm4                   \n"
1821    ".p2align  4                               \n"
1822  "1:                                          \n"
1823    "movdqa    (%1),%%xmm0                     \n"
1824    "movdqa    (%1,%4,1),%%xmm2                \n"
1825    "movdqa    %%xmm0,%%xmm1                   \n"
1826    "movdqa    %%xmm2,%%xmm3                   \n"
1827    "punpcklbw %%xmm4,%%xmm2                   \n"
1828    "punpckhbw %%xmm4,%%xmm3                   \n"
1829    "punpcklbw %%xmm4,%%xmm0                   \n"
1830    "punpckhbw %%xmm4,%%xmm1                   \n"
1831    "psubw     %%xmm0,%%xmm2                   \n"
1832    "psubw     %%xmm1,%%xmm3                   \n"
1833    "pmulhw    %%xmm5,%%xmm2                   \n"
1834    "pmulhw    %%xmm5,%%xmm3                   \n"
1835    "paddw     %%xmm2,%%xmm0                   \n"
1836    "paddw     %%xmm3,%%xmm1                   \n"
1837    "packuswb  %%xmm1,%%xmm0                   \n"
1838    "sub       $0x10,%2                        \n"
1839    "movdqa    %%xmm0,(%1,%0,1)                \n"
1840    "lea       0x10(%1),%1                     \n"
1841    "jg        1b                              \n"
1842    "jmp       4f                              \n"
1843    ".p2align  4                               \n"
1844  "2:                                          \n"
1845    "movdqa    (%1),%%xmm0                     \n"
1846    "sub       $0x10,%2                        \n"
1847    "movdqa    %%xmm0,(%1,%0,1)                \n"
1848    "lea       0x10(%1),%1                     \n"
1849    "jg        2b                              \n"
1850    "jmp       4f                              \n"
1851    ".p2align  4                               \n"
1852  "3:                                          \n"
1853    "movdqa    (%1),%%xmm0                     \n"
1854    "pavgb     (%1,%4,1),%%xmm0                \n"
1855    "sub       $0x10,%2                        \n"
1856    "movdqa    %%xmm0,(%1,%0,1)                \n"
1857    "lea       0x10(%1),%1                     \n"
1858    "jg        3b                              \n"
1859    ".p2align  4                               \n"
1860  "4:                                          \n"
1861    "punpckhbw %%xmm0,%%xmm0                   \n"
1862    "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
1863    "punpckhqdq %%xmm0,%%xmm0                  \n"
1864    "movdqa    %%xmm0,(%1,%0,1)                \n"
1865  : "+r"(dst_ptr),    // %0
1866    "+r"(src_ptr),    // %1
1867    "+r"(dst_width),  // %2
1868    "+r"(source_y_fraction)  // %3
1869  : "r"(static_cast<intptr_t>(src_stride))  // %4
1870  : "memory", "cc"
1871#if defined(__SSE2__)
1872    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1873#endif
1874  );
1875}
1876#endif  // SSE2_DISABLED
1877
1878// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version
1879#define HAS_SCALEFILTERROWS_SSSE3
1880static void ScaleFilterRows_SSSE3(uint8* dst_ptr,
1881                                  const uint8* src_ptr, ptrdiff_t src_stride,
1882                                  int dst_width, int source_y_fraction) {
1883  asm volatile (
1884    "sub       %1,%0                           \n"
1885    "shr       %3                              \n"
1886    "cmp       $0x0,%3                         \n"
1887    "je        2f                              \n"
1888    "cmp       $0x40,%3                        \n"
1889    "je        3f                              \n"
1890    "movd      %3,%%xmm0                       \n"
1891    "neg       %3                              \n"
1892    "add       $0x80,%3                        \n"
1893    "movd      %3,%%xmm5                       \n"
1894    "punpcklbw %%xmm0,%%xmm5                   \n"
1895    "punpcklwd %%xmm5,%%xmm5                   \n"
1896    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
1897    ".p2align  4                               \n"
1898  "1:                                          \n"
1899    "movdqa    (%1),%%xmm0                     \n"
1900    "movdqa    (%1,%4,1),%%xmm2                \n"
1901    "movdqa    %%xmm0,%%xmm1                   \n"
1902    "punpcklbw %%xmm2,%%xmm0                   \n"
1903    "punpckhbw %%xmm2,%%xmm1                   \n"
1904    "pmaddubsw %%xmm5,%%xmm0                   \n"
1905    "pmaddubsw %%xmm5,%%xmm1                   \n"
1906    "psrlw     $0x7,%%xmm0                     \n"
1907    "psrlw     $0x7,%%xmm1                     \n"
1908    "packuswb  %%xmm1,%%xmm0                   \n"
1909    "sub       $0x10,%2                        \n"
1910    "movdqa    %%xmm0,(%1,%0,1)                \n"
1911    "lea       0x10(%1),%1                     \n"
1912    "jg        1b                              \n"
1913    "jmp       4f                              \n"
1914    ".p2align  4                               \n"
1915  "2:                                          \n"
1916    "movdqa    (%1),%%xmm0                     \n"
1917    "sub       $0x10,%2                        \n"
1918    "movdqa    %%xmm0,(%1,%0,1)                \n"
1919    "lea       0x10(%1),%1                     \n"
1920    "jg        2b                              \n"
1921    "jmp       4f                              \n"
1922    ".p2align  4                               \n"
1923  "3:                                          \n"
1924    "movdqa    (%1),%%xmm0                     \n"
1925    "pavgb     (%1,%4,1),%%xmm0                \n"
1926    "sub       $0x10,%2                        \n"
1927    "movdqa    %%xmm0,(%1,%0,1)                \n"
1928    "lea       0x10(%1),%1                     \n"
1929    "jg        3b                              \n"
1930    ".p2align  4                               \n"
1931  "4:                                          \n"
1932    "punpckhbw %%xmm0,%%xmm0                   \n"
1933    "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
1934    "punpckhqdq %%xmm0,%%xmm0                  \n"
1935    "movdqa    %%xmm0,(%1,%0,1)                \n"
1936  : "+r"(dst_ptr),    // %0
1937    "+r"(src_ptr),    // %1
1938    "+r"(dst_width),  // %2
1939    "+r"(source_y_fraction)  // %3
1940  : "r"(static_cast<intptr_t>(src_stride))  // %4
1941  : "memory", "cc"
1942#if defined(__SSE2__)
1943    , "xmm0", "xmm1", "xmm2", "xmm5"
1944#endif
1945  );
1946}
1947#endif  // defined(__x86_64__) || defined(__i386__)
1948
1949// CPU agnostic row functions
1950static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
1951                            uint8* dst, int dst_width) {
1952  uint8* dend = dst + dst_width - 1;
1953  do {
1954    dst[0] = src_ptr[0];
1955    dst[1] = src_ptr[2];
1956    dst += 2;
1957    src_ptr += 4;
1958  } while (dst < dend);
1959  if (dst_width & 1) {
1960    dst[0] = src_ptr[0];
1961  }
1962}
1963
1964void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
1965                        uint8* dst, int dst_width) {
1966  const uint8* s = src_ptr;
1967  const uint8* t = src_ptr + src_stride;
1968  uint8* dend = dst + dst_width - 1;
1969  do {
1970    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
1971    dst[1] = (s[2] + s[3] + t[2] + t[3] + 2) >> 2;
1972    dst += 2;
1973    s += 4;
1974    t += 4;
1975  } while (dst < dend);
1976  if (dst_width & 1) {
1977    dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2;
1978  }
1979}
1980
1981static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
1982                            uint8* dst, int dst_width) {
1983  uint8* dend = dst + dst_width - 1;
1984  do {
1985    dst[0] = src_ptr[0];
1986    dst[1] = src_ptr[4];
1987    dst += 2;
1988    src_ptr += 8;
1989  } while (dst < dend);
1990  if (dst_width & 1) {
1991    dst[0] = src_ptr[0];
1992  }
1993}
1994
1995static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
1996                               uint8* dst, int dst_width) {
1997  intptr_t stride = src_stride;
1998  uint8* dend = dst + dst_width - 1;
1999  do {
2000    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
2001             src_ptr[stride + 0] + src_ptr[stride + 1] +
2002             src_ptr[stride + 2] + src_ptr[stride + 3] +
2003             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
2004             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
2005             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
2006             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
2007             8) >> 4;
2008    dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] +
2009             src_ptr[stride + 4] + src_ptr[stride + 5] +
2010             src_ptr[stride + 6] + src_ptr[stride + 7] +
2011             src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] +
2012             src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] +
2013             src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] +
2014             src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] +
2015             8) >> 4;
2016    dst += 2;
2017    src_ptr += 8;
2018  } while (dst < dend);
2019  if (dst_width & 1) {
2020    dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] +
2021             src_ptr[stride + 0] + src_ptr[stride + 1] +
2022             src_ptr[stride + 2] + src_ptr[stride + 3] +
2023             src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] +
2024             src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] +
2025             src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] +
2026             src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] +
2027             8) >> 4;
2028  }
2029}
2030
2031// 640 output pixels is enough to allow 5120 input pixels with 1/8 scale down.
2032// Keeping the total buffer under 4096 bytes avoids a stackcheck, saving 4% cpu.
2033static const int kMaxOutputWidth = 640;
2034static const int kMaxRow12 = kMaxOutputWidth * 2;
2035
2036static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
2037                            uint8* dst, int dst_width) {
2038  uint8* dend = dst + dst_width - 1;
2039  do {
2040    dst[0] = src_ptr[0];
2041    dst[1] = src_ptr[8];
2042    dst += 2;
2043    src_ptr += 16;
2044  } while (dst < dend);
2045  if (dst_width & 1) {
2046    dst[0] = src_ptr[0];
2047  }
2048}
2049
2050// Note calling code checks width is less than max and if not
2051// uses ScaleRowDown8_C instead.
2052static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
2053                               uint8* dst, int dst_width) {
2054  SIMD_ALIGNED(uint8 src_row[kMaxRow12 * 2]);
2055  assert(dst_width <= kMaxOutputWidth);
2056  ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2);
2057  ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride,
2058                     src_row + kMaxOutputWidth,
2059                     dst_width * 2);
2060  ScaleRowDown2Int_C(src_row, kMaxOutputWidth, dst, dst_width);
2061}
2062
2063static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
2064                             uint8* dst, int dst_width) {
2065  assert((dst_width % 3 == 0) && (dst_width > 0));
2066  uint8* dend = dst + dst_width;
2067  do {
2068    dst[0] = src_ptr[0];
2069    dst[1] = src_ptr[1];
2070    dst[2] = src_ptr[3];
2071    dst += 3;
2072    src_ptr += 4;
2073  } while (dst < dend);
2074}
2075
2076// Filter rows 0 and 1 together, 3 : 1
2077static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
2078                                   uint8* d, int dst_width) {
2079  assert((dst_width % 3 == 0) && (dst_width > 0));
2080  const uint8* s = src_ptr;
2081  const uint8* t = src_ptr + src_stride;
2082  uint8* dend = d + dst_width;
2083  do {
2084    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2085    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2086    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2087    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2088    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2089    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2090    d[0] = (a0 * 3 + b0 + 2) >> 2;
2091    d[1] = (a1 * 3 + b1 + 2) >> 2;
2092    d[2] = (a2 * 3 + b2 + 2) >> 2;
2093    d += 3;
2094    s += 4;
2095    t += 4;
2096  } while (d < dend);
2097}
2098
2099// Filter rows 1 and 2 together, 1 : 1
2100static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
2101                                   uint8* d, int dst_width) {
2102  assert((dst_width % 3 == 0) && (dst_width > 0));
2103  const uint8* s = src_ptr;
2104  const uint8* t = src_ptr + src_stride;
2105  uint8* dend = d + dst_width;
2106  do {
2107    uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2108    uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2109    uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2110    uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2;
2111    uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1;
2112    uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2;
2113    d[0] = (a0 + b0 + 1) >> 1;
2114    d[1] = (a1 + b1 + 1) >> 1;
2115    d[2] = (a2 + b2 + 1) >> 1;
2116    d += 3;
2117    s += 4;
2118    t += 4;
2119  } while (d < dend);
2120}
2121
2122// (1-f)a + fb can be replaced with a + f(b-a)
2123#define BLENDER(a, b, f) (static_cast<int>(a) + \
2124    ((f) * (static_cast<int>(b) - static_cast<int>(a)) >> 16))
2125
2126static void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr,
2127                              int dst_width, int x, int dx) {
2128  for (int j = 0; j < dst_width - 1; j += 2) {
2129    int xi = x >> 16;
2130    int a = src_ptr[xi];
2131    int b = src_ptr[xi + 1];
2132    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
2133    x += dx;
2134    xi = x >> 16;
2135    a = src_ptr[xi];
2136    b = src_ptr[xi + 1];
2137    dst_ptr[1] = BLENDER(a, b, x & 0xffff);
2138    x += dx;
2139    dst_ptr += 2;
2140  }
2141  if (dst_width & 1) {
2142    int xi = x >> 16;
2143    int a = src_ptr[xi];
2144    int b = src_ptr[xi + 1];
2145    dst_ptr[0] = BLENDER(a, b, x & 0xffff);
2146  }
2147}
2148
2149static const int kMaxInputWidth = 2560;
2150
2151#if defined(HAS_SCALEFILTERROWS_SSE2)
2152// Filter row to 3/4
2153static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr,
2154                                int dst_width) {
2155  assert((dst_width % 3 == 0) && (dst_width > 0));
2156  const uint8* s = src_ptr;
2157  uint8* dend = dst_ptr + dst_width;
2158  do {
2159    dst_ptr[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
2160    dst_ptr[1] = (s[1] * 1 + s[2] * 1 + 1) >> 1;
2161    dst_ptr[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
2162    dst_ptr += 3;
2163    s += 4;
2164  } while (dst_ptr < dend);
2165}
2166
2167#define HAS_SCALEROWDOWN34_SSE2_DISABLED
2168// Filter rows 0 and 1 together, 3 : 1
2169static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr,
2170                                      ptrdiff_t src_stride,
2171                                      uint8* dst_ptr, int dst_width) {
2172  assert((dst_width % 3 == 0) && (dst_width > 0));
2173  SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
2174  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 4);
2175  ScaleFilterCols34_C(dst_ptr, row, dst_width);
2176}
2177
2178// Filter rows 1 and 2 together, 1 : 1
2179static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr,
2180                                      ptrdiff_t src_stride,
2181                                      uint8* dst_ptr, int dst_width) {
2182  assert((dst_width % 3 == 0) && (dst_width > 0));
2183  SIMD_ALIGNED(uint8 row[kMaxInputWidth]);
2184  ScaleFilterRows_SSE2(row, src_ptr, src_stride, dst_width * 4 / 3, 256 / 2);
2185  ScaleFilterCols34_C(dst_ptr, row, dst_width);
2186}
2187#endif
2188
2189static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
2190                             uint8* dst, int dst_width) {
2191  assert(dst_width % 3 == 0);
2192  for (int x = 0; x < dst_width; x += 3) {
2193    dst[0] = src_ptr[0];
2194    dst[1] = src_ptr[3];
2195    dst[2] = src_ptr[6];
2196    dst += 3;
2197    src_ptr += 8;
2198  }
2199}
2200
2201// 8x3 -> 3x1
2202static void ScaleRowDown38_3_Int_C(const uint8* src_ptr,
2203                                   ptrdiff_t src_stride,
2204                                   uint8* dst_ptr, int dst_width) {
2205  assert((dst_width % 3 == 0) && (dst_width > 0));
2206  intptr_t stride = src_stride;
2207  for (int i = 0; i < dst_width; i += 3) {
2208    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2209        src_ptr[stride + 0] + src_ptr[stride + 1] +
2210        src_ptr[stride + 2] + src_ptr[stride * 2 + 0] +
2211        src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) *
2212        (65536 / 9) >> 16;
2213    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2214        src_ptr[stride + 3] + src_ptr[stride + 4] +
2215        src_ptr[stride + 5] + src_ptr[stride * 2 + 3] +
2216        src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) *
2217        (65536 / 9) >> 16;
2218    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2219        src_ptr[stride + 6] + src_ptr[stride + 7] +
2220        src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) *
2221        (65536 / 6) >> 16;
2222    src_ptr += 8;
2223    dst_ptr += 3;
2224  }
2225}
2226
2227// 8x2 -> 3x1
2228static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride,
2229                                   uint8* dst_ptr, int dst_width) {
2230  assert((dst_width % 3 == 0) && (dst_width > 0));
2231  intptr_t stride = src_stride;
2232  for (int i = 0; i < dst_width; i += 3) {
2233    dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] +
2234        src_ptr[stride + 0] + src_ptr[stride + 1] +
2235        src_ptr[stride + 2]) * (65536 / 6) >> 16;
2236    dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] +
2237        src_ptr[stride + 3] + src_ptr[stride + 4] +
2238        src_ptr[stride + 5]) * (65536 / 6) >> 16;
2239    dst_ptr[2] = (src_ptr[6] + src_ptr[7] +
2240        src_ptr[stride + 6] + src_ptr[stride + 7]) *
2241        (65536 / 4) >> 16;
2242    src_ptr += 8;
2243    dst_ptr += 3;
2244  }
2245}
2246
2247// C version 8x2 -> 8x1
2248static void ScaleFilterRows_C(uint8* dst_ptr,
2249                              const uint8* src_ptr, ptrdiff_t src_stride,
2250                              int dst_width, int source_y_fraction) {
2251  assert(dst_width > 0);
2252  int y1_fraction = source_y_fraction;
2253  int y0_fraction = 256 - y1_fraction;
2254  const uint8* src_ptr1 = src_ptr + src_stride;
2255  uint8* end = dst_ptr + dst_width;
2256  do {
2257    dst_ptr[0] = (src_ptr[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
2258    dst_ptr[1] = (src_ptr[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
2259    dst_ptr[2] = (src_ptr[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
2260    dst_ptr[3] = (src_ptr[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
2261    dst_ptr[4] = (src_ptr[4] * y0_fraction + src_ptr1[4] * y1_fraction) >> 8;
2262    dst_ptr[5] = (src_ptr[5] * y0_fraction + src_ptr1[5] * y1_fraction) >> 8;
2263    dst_ptr[6] = (src_ptr[6] * y0_fraction + src_ptr1[6] * y1_fraction) >> 8;
2264    dst_ptr[7] = (src_ptr[7] * y0_fraction + src_ptr1[7] * y1_fraction) >> 8;
2265    src_ptr += 8;
2266    src_ptr1 += 8;
2267    dst_ptr += 8;
2268  } while (dst_ptr < end);
2269  dst_ptr[0] = dst_ptr[-1];
2270}
2271
2272void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
2273                    uint16* dst_ptr, int src_width, int src_height) {
2274  assert(src_width > 0);
2275  assert(src_height > 0);
2276  for (int x = 0; x < src_width; ++x) {
2277    const uint8* s = src_ptr + x;
2278    int sum = 0;
2279    for (int y = 0; y < src_height; ++y) {
2280      sum += s[0];
2281      s += src_stride;
2282    }
2283    dst_ptr[x] = sum;
2284  }
2285}
2286
2287/**
2288 * Scale plane, 1/2
2289 *
2290 * This is an optimized version for scaling down a plane to 1/2 of
2291 * its original size.
2292 *
2293 */
2294static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
2295                            int dst_width, int dst_height,
2296                            int src_stride, int dst_stride,
2297                            const uint8* src_ptr, uint8* dst_ptr,
2298                            FilterMode filtering) {
2299  void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
2300                        uint8* dst_ptr, int dst_width) =
2301      filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
2302#if defined(HAS_SCALEROWDOWN2_NEON)
2303  if (TestCpuFlag(kCpuHasNEON) &&
2304      IS_ALIGNED(dst_width, 16)) {
2305    ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
2306  }
2307#elif defined(HAS_SCALEROWDOWN2_SSE2)
2308  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
2309    ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
2310        ScaleRowDown2_Unaligned_SSE2;
2311    if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
2312        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
2313      ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
2314    }
2315  }
2316#endif
2317
2318  // TODO(fbarchard): Loop through source height to allow odd height.
2319  for (int y = 0; y < dst_height; ++y) {
2320    ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
2321    src_ptr += (src_stride << 1);
2322    dst_ptr += dst_stride;
2323  }
2324}
2325
2326/**
2327 * Scale plane, 1/4
2328 *
2329 * This is an optimized version for scaling down a plane to 1/4 of
2330 * its original size.
2331 */
2332static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
2333                            int dst_width, int dst_height,
2334                            int src_stride, int dst_stride,
2335                            const uint8* src_ptr, uint8* dst_ptr,
2336                            FilterMode filtering) {
2337  void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
2338                        uint8* dst_ptr, int dst_width) =
2339      filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
2340#if defined(HAS_SCALEROWDOWN4_NEON)
2341  if (TestCpuFlag(kCpuHasNEON) &&
2342      IS_ALIGNED(dst_width, 4)) {
2343    ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON;
2344  }
2345#elif defined(HAS_SCALEROWDOWN4_SSE2)
2346  if (TestCpuFlag(kCpuHasSSE2) &&
2347      IS_ALIGNED(dst_width, 8) &&
2348      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
2349    ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
2350  }
2351#endif
2352
2353  for (int y = 0; y < dst_height; ++y) {
2354    ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width);
2355    src_ptr += (src_stride << 2);
2356    dst_ptr += dst_stride;
2357  }
2358}
2359
2360/**
2361 * Scale plane, 1/8
2362 *
2363 * This is an optimized version for scaling down a plane to 1/8
2364 * of its original size.
2365 *
2366 */
2367static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
2368                            int dst_width, int dst_height,
2369                            int src_stride, int dst_stride,
2370                            const uint8* src_ptr, uint8* dst_ptr,
2371                            FilterMode filtering) {
2372  void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
2373                        uint8* dst_ptr, int dst_width) =
2374      filtering && (dst_width <= kMaxOutputWidth) ?
2375      ScaleRowDown8Int_C : ScaleRowDown8_C;
2376#if defined(HAS_SCALEROWDOWN8_SSE2)
2377  if (TestCpuFlag(kCpuHasSSE2) &&
2378      IS_ALIGNED(dst_width, 4) &&
2379      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
2380    ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2;
2381  }
2382#endif
2383
2384  for (int y = 0; y < dst_height; ++y) {
2385    ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width);
2386    src_ptr += (src_stride << 3);
2387    dst_ptr += dst_stride;
2388  }
2389}
2390
2391/**
2392 * Scale plane down, 3/4
2393 *
2394 * Provided by Frank Barchard (fbarchard@google.com)
2395 *
2396 */
2397static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
2398                             int dst_width, int dst_height,
2399                             int src_stride, int dst_stride,
2400                             const uint8* src_ptr, uint8* dst_ptr,
2401                             FilterMode filtering) {
2402  assert(dst_width % 3 == 0);
2403  void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride,
2404                           uint8* dst_ptr, int dst_width);
2405  void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride,
2406                           uint8* dst_ptr, int dst_width);
2407  if (!filtering) {
2408    ScaleRowDown34_0 = ScaleRowDown34_C;
2409    ScaleRowDown34_1 = ScaleRowDown34_C;
2410  } else {
2411    ScaleRowDown34_0 = ScaleRowDown34_0_Int_C;
2412    ScaleRowDown34_1 = ScaleRowDown34_1_Int_C;
2413  }
2414#if defined(HAS_SCALEROWDOWN34_NEON)
2415  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) {
2416    if (!filtering) {
2417      ScaleRowDown34_0 = ScaleRowDown34_NEON;
2418      ScaleRowDown34_1 = ScaleRowDown34_NEON;
2419    } else {
2420      ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON;
2421      ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON;
2422    }
2423  }
2424#endif
2425#if defined(HAS_SCALEROWDOWN34_SSE2)
2426  if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) &&
2427      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) {
2428    ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2;
2429    ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2;
2430  }
2431#endif
2432#if defined(HAS_SCALEROWDOWN34_SSSE3)
2433  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
2434      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
2435    if (!filtering) {
2436      ScaleRowDown34_0 = ScaleRowDown34_SSSE3;
2437      ScaleRowDown34_1 = ScaleRowDown34_SSSE3;
2438    } else {
2439      ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3;
2440      ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3;
2441    }
2442  }
2443#endif
2444
2445  for (int y = 0; y < dst_height - 2; y += 3) {
2446    ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
2447    src_ptr += src_stride;
2448    dst_ptr += dst_stride;
2449    ScaleRowDown34_1(src_ptr, src_stride, dst_ptr, dst_width);
2450    src_ptr += src_stride;
2451    dst_ptr += dst_stride;
2452    ScaleRowDown34_0(src_ptr + src_stride, -src_stride,
2453                     dst_ptr, dst_width);
2454    src_ptr += src_stride * 2;
2455    dst_ptr += dst_stride;
2456  }
2457
2458  // Remainder 1 or 2 rows with last row vertically unfiltered
2459  if ((dst_height % 3) == 2) {
2460    ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
2461    src_ptr += src_stride;
2462    dst_ptr += dst_stride;
2463    ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width);
2464  } else if ((dst_height % 3) == 1) {
2465    ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width);
2466  }
2467}
2468
2469/**
2470 * Scale plane, 3/8
2471 *
2472 * This is an optimized version for scaling down a plane to 3/8
2473 * of its original size.
2474 *
2475 * Uses box filter arranges like this
2476 * aaabbbcc -> abc
2477 * aaabbbcc    def
2478 * aaabbbcc    ghi
2479 * dddeeeff
2480 * dddeeeff
2481 * dddeeeff
2482 * ggghhhii
2483 * ggghhhii
2484 * Boxes are 3x3, 2x3, 3x2 and 2x2
2485 */
2486static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
2487                             int dst_width, int dst_height,
2488                             int src_stride, int dst_stride,
2489                             const uint8* src_ptr, uint8* dst_ptr,
2490                             FilterMode filtering) {
2491  assert(dst_width % 3 == 0);
2492  void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride,
2493                           uint8* dst_ptr, int dst_width);
2494  void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride,
2495                           uint8* dst_ptr, int dst_width);
2496  if (!filtering) {
2497    ScaleRowDown38_3 = ScaleRowDown38_C;
2498    ScaleRowDown38_2 = ScaleRowDown38_C;
2499  } else {
2500    ScaleRowDown38_3 = ScaleRowDown38_3_Int_C;
2501    ScaleRowDown38_2 = ScaleRowDown38_2_Int_C;
2502  }
2503#if defined(HAS_SCALEROWDOWN38_NEON)
2504  if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) {
2505    if (!filtering) {
2506      ScaleRowDown38_3 = ScaleRowDown38_NEON;
2507      ScaleRowDown38_2 = ScaleRowDown38_NEON;
2508    } else {
2509      ScaleRowDown38_3 = ScaleRowDown38_3_Int_NEON;
2510      ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON;
2511    }
2512  }
2513#elif defined(HAS_SCALEROWDOWN38_SSSE3)
2514  if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) &&
2515      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
2516    if (!filtering) {
2517      ScaleRowDown38_3 = ScaleRowDown38_SSSE3;
2518      ScaleRowDown38_2 = ScaleRowDown38_SSSE3;
2519    } else {
2520      ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3;
2521      ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
2522    }
2523  }
2524#endif
2525
2526  for (int y = 0; y < dst_height - 2; y += 3) {
2527    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
2528    src_ptr += src_stride * 3;
2529    dst_ptr += dst_stride;
2530    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
2531    src_ptr += src_stride * 3;
2532    dst_ptr += dst_stride;
2533    ScaleRowDown38_2(src_ptr, src_stride, dst_ptr, dst_width);
2534    src_ptr += src_stride * 2;
2535    dst_ptr += dst_stride;
2536  }
2537
2538  // Remainder 1 or 2 rows with last row vertically unfiltered
2539  if ((dst_height % 3) == 2) {
2540    ScaleRowDown38_3(src_ptr, src_stride, dst_ptr, dst_width);
2541    src_ptr += src_stride * 3;
2542    dst_ptr += dst_stride;
2543    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
2544  } else if ((dst_height % 3) == 1) {
2545    ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width);
2546  }
2547}
2548
2549static __inline uint32 SumBox(int iboxwidth, int iboxheight,
2550                              ptrdiff_t src_stride, const uint8* src_ptr) {
2551  assert(iboxwidth > 0);
2552  assert(iboxheight > 0);
2553  uint32 sum = 0u;
2554  for (int y = 0; y < iboxheight; ++y) {
2555    for (int x = 0; x < iboxwidth; ++x) {
2556      sum += src_ptr[x];
2557    }
2558    src_ptr += src_stride;
2559  }
2560  return sum;
2561}
2562
2563static void ScalePlaneBoxRow_C(int dst_width, int boxheight,
2564                               int x, int dx, ptrdiff_t src_stride,
2565                               const uint8* src_ptr, uint8* dst_ptr) {
2566  for (int i = 0; i < dst_width; ++i) {
2567    int ix = x >> 16;
2568    x += dx;
2569    int boxwidth = (x >> 16) - ix;
2570    *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) /
2571        (boxwidth * boxheight);
2572  }
2573}
2574
2575static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) {
2576  assert(iboxwidth > 0);
2577  uint32 sum = 0u;
2578  for (int x = 0; x < iboxwidth; ++x) {
2579    sum += src_ptr[x];
2580  }
2581  return sum;
2582}
2583
2584static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx,
2585                            const uint16* src_ptr, uint8* dst_ptr) {
2586  int scaletbl[2];
2587  int minboxwidth = (dx >> 16);
2588  scaletbl[0] = 65536 / (minboxwidth * boxheight);
2589  scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight);
2590  int *scaleptr = scaletbl - minboxwidth;
2591  for (int i = 0; i < dst_width; ++i) {
2592    int ix = x >> 16;
2593    x += dx;
2594    int boxwidth = (x >> 16) - ix;
2595    *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16;
2596  }
2597}
2598
2599static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
2600                            const uint16* src_ptr, uint8* dst_ptr) {
2601  int boxwidth = (dx >> 16);
2602  int scaleval = 65536 / (boxwidth * boxheight);
2603  for (int i = 0; i < dst_width; ++i) {
2604    *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16;
2605    x += boxwidth;
2606  }
2607}
2608
2609/**
2610 * Scale plane down to any dimensions, with interpolation.
2611 * (boxfilter).
2612 *
2613 * Same method as SimpleScale, which is fixed point, outputting
2614 * one pixel of destination using fixed point (16.16) to step
2615 * through source, sampling a box of pixel with simple
2616 * averaging.
2617 */
2618static void ScalePlaneBox(int src_width, int src_height,
2619                          int dst_width, int dst_height,
2620                          int src_stride, int dst_stride,
2621                          const uint8* src_ptr, uint8* dst_ptr) {
2622  assert(dst_width > 0);
2623  assert(dst_height > 0);
2624  int dx = (src_width << 16) / dst_width;
2625  int dy = (src_height << 16) / dst_height;
2626  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
2627  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
2628  int maxy = (src_height << 16);
2629  if (!IS_ALIGNED(src_width, 16) || (src_width > kMaxInputWidth) ||
2630      dst_height * 2 > src_height) {
2631    uint8* dst = dst_ptr;
2632    for (int j = 0; j < dst_height; ++j) {
2633      int iy = y >> 16;
2634      const uint8* src = src_ptr + iy * src_stride;
2635      y += dy;
2636      if (y > maxy) {
2637        y = maxy;
2638      }
2639      int boxheight = (y >> 16) - iy;
2640      ScalePlaneBoxRow_C(dst_width, boxheight,
2641                         x, dx, src_stride,
2642                         src, dst);
2643      dst += dst_stride;
2644    }
2645  } else {
2646    SIMD_ALIGNED(uint16 row[kMaxInputWidth]);
2647    void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride,
2648                         uint16* dst_ptr, int src_width, int src_height)=
2649        ScaleAddRows_C;
2650    void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx,
2651                         const uint16* src_ptr, uint8* dst_ptr);
2652    if (dx & 0xffff) {
2653      ScaleAddCols = ScaleAddCols2_C;
2654    } else {
2655      ScaleAddCols = ScaleAddCols1_C;
2656    }
2657#if defined(HAS_SCALEADDROWS_SSE2)
2658    if (TestCpuFlag(kCpuHasSSE2) &&
2659        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
2660      ScaleAddRows = ScaleAddRows_SSE2;
2661    }
2662#endif
2663
2664    for (int j = 0; j < dst_height; ++j) {
2665      int iy = y >> 16;
2666      const uint8* src = src_ptr + iy * src_stride;
2667      y += dy;
2668      if (y > (src_height << 16)) {
2669        y = (src_height << 16);
2670      }
2671      int boxheight = (y >> 16) - iy;
2672      ScaleAddRows(src, src_stride, row, src_width, boxheight);
2673      ScaleAddCols(dst_width, boxheight, x, dx, row, dst_ptr);
2674      dst_ptr += dst_stride;
2675    }
2676  }
2677}
2678
2679/**
2680 * Scale plane to/from any dimensions, with interpolation.
2681 */
2682static void ScalePlaneBilinearSimple(int src_width, int src_height,
2683                                     int dst_width, int dst_height,
2684                                     int src_stride, int dst_stride,
2685                                     const uint8* src_ptr, uint8* dst_ptr) {
2686  int dx = (src_width << 16) / dst_width;
2687  int dy = (src_height << 16) / dst_height;
2688  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
2689  int maxx = (src_width > 1) ? ((src_width - 1) << 16) - 1 : 0;
2690  int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
2691  for (int i = 0; i < dst_height; ++i) {
2692    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
2693    int yi = y >> 16;
2694    int yf = y & 0xffff;
2695    const uint8* src0 = src_ptr + yi * src_stride;
2696    const uint8* src1 = (yi < src_height - 1) ? src0 + src_stride : src0;
2697    uint8* dst = dst_ptr;
2698    for (int j = 0; j < dst_width; ++j) {
2699      int xi = x >> 16;
2700      int xf = x & 0xffff;
2701      int x1 = (xi < src_width - 1) ? xi + 1 : xi;
2702      int a = src0[xi];
2703      int b = src0[x1];
2704      int r0 = BLENDER(a, b, xf);
2705      a = src1[xi];
2706      b = src1[x1];
2707      int r1 = BLENDER(a, b, xf);
2708      *dst++ = BLENDER(r0, r1, yf);
2709      x += dx;
2710      if (x > maxx)
2711        x = maxx;
2712    }
2713    dst_ptr += dst_stride;
2714    y += dy;
2715    if (y > maxy)
2716      y = maxy;
2717  }
2718}
2719
2720/**
2721 * Scale plane to/from any dimensions, with bilinear
2722 * interpolation.
2723 */
2724void ScalePlaneBilinear(int src_width, int src_height,
2725                        int dst_width, int dst_height,
2726                        int src_stride, int dst_stride,
2727                        const uint8* src_ptr, uint8* dst_ptr) {
2728  assert(dst_width > 0);
2729  assert(dst_height > 0);
2730  if (!IS_ALIGNED(src_width, 8) || (src_width > kMaxInputWidth)) {
2731    ScalePlaneBilinearSimple(src_width, src_height, dst_width, dst_height,
2732                             src_stride, dst_stride, src_ptr, dst_ptr);
2733
2734  } else {
2735    SIMD_ALIGNED(uint8 row[kMaxInputWidth + 16]);
2736    void (*ScaleFilterRows)(uint8* dst_ptr, const uint8* src_ptr,
2737                            ptrdiff_t src_stride,
2738                            int dst_width, int source_y_fraction) =
2739        ScaleFilterRows_C;
2740#if defined(HAS_SCALEFILTERROWS_NEON)
2741    if (TestCpuFlag(kCpuHasNEON)) {
2742      ScaleFilterRows = ScaleFilterRows_NEON;
2743    }
2744#endif
2745#if defined(HAS_SCALEFILTERROWS_SSE2)
2746    if (TestCpuFlag(kCpuHasSSE2) &&
2747        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
2748      ScaleFilterRows = ScaleFilterRows_SSE2;
2749    }
2750#endif
2751#if defined(HAS_SCALEFILTERROWS_SSSE3)
2752    if (TestCpuFlag(kCpuHasSSSE3) &&
2753        IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_ptr, 16)) {
2754      ScaleFilterRows = ScaleFilterRows_SSSE3;
2755    }
2756#endif
2757
2758    int dx = (src_width << 16) / dst_width;
2759    int dy = (src_height << 16) / dst_height;
2760    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
2761    int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
2762    int maxy = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0;
2763    for (int j = 0; j < dst_height; ++j) {
2764      int yi = y >> 16;
2765      int yf = (y >> 8) & 255;
2766      const uint8* src = src_ptr + yi * src_stride;
2767      ScaleFilterRows(row, src, src_stride, src_width, yf);
2768      ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
2769      dst_ptr += dst_stride;
2770      y += dy;
2771      if (y > maxy) {
2772        y = maxy;
2773      }
2774    }
2775  }
2776}
2777
2778/**
2779 * Scale plane to/from any dimensions, without interpolation.
2780 * Fixed point math is used for performance: The upper 16 bits
2781 * of x and dx is the integer part of the source position and
2782 * the lower 16 bits are the fixed decimal part.
2783 */
2784static void ScalePlaneSimple(int src_width, int src_height,
2785                             int dst_width, int dst_height,
2786                             int src_stride, int dst_stride,
2787                             const uint8* src_ptr, uint8* dst_ptr) {
2788  int dx = (src_width << 16) / dst_width;
2789  int dy = (src_height << 16) / dst_height;
2790  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
2791  for (int j = 0; j < dst_height; ++j) {
2792    int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
2793    int yi = y >> 16;
2794    const uint8* src = src_ptr + yi * src_stride;
2795    uint8* dst = dst_ptr;
2796    for (int i = 0; i < dst_width; ++i) {
2797      *dst++ = src[x >> 16];
2798      x += dx;
2799    }
2800    dst_ptr += dst_stride;
2801    y += dy;
2802  }
2803}
2804
2805/**
2806 * Scale plane to/from any dimensions.
2807 */
2808static void ScalePlaneAnySize(int src_width, int src_height,
2809                              int dst_width, int dst_height,
2810                              int src_stride, int dst_stride,
2811                              const uint8* src_ptr, uint8* dst_ptr,
2812                              FilterMode filtering) {
2813  if (!filtering) {
2814    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
2815                     src_stride, dst_stride, src_ptr, dst_ptr);
2816  } else {
2817    // fall back to non-optimized version
2818    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
2819                       src_stride, dst_stride, src_ptr, dst_ptr);
2820  }
2821}
2822
2823/**
2824 * Scale plane down, any size
2825 *
2826 * This is an optimized version for scaling down a plane to any size.
2827 * The current implementation is ~10 times faster compared to the
2828 * reference implementation for e.g. XGA->LowResPAL
2829 *
2830 */
2831static void ScalePlaneDown(int src_width, int src_height,
2832                           int dst_width, int dst_height,
2833                           int src_stride, int dst_stride,
2834                           const uint8* src_ptr, uint8* dst_ptr,
2835                           FilterMode filtering) {
2836  if (!filtering) {
2837    ScalePlaneSimple(src_width, src_height, dst_width, dst_height,
2838                     src_stride, dst_stride, src_ptr, dst_ptr);
2839  } else if (filtering == kFilterBilinear || src_height * 2 > dst_height) {
2840    // between 1/2x and 1x use bilinear
2841    ScalePlaneBilinear(src_width, src_height, dst_width, dst_height,
2842                       src_stride, dst_stride, src_ptr, dst_ptr);
2843  } else {
2844    ScalePlaneBox(src_width, src_height, dst_width, dst_height,
2845                  src_stride, dst_stride, src_ptr, dst_ptr);
2846  }
2847}
2848
2849// Scale a plane.
2850// This function in turn calls a scaling function suitable for handling
2851// the desired resolutions.
2852
2853LIBYUV_API
2854void ScalePlane(const uint8* src, int src_stride,
2855                int src_width, int src_height,
2856                uint8* dst, int dst_stride,
2857                int dst_width, int dst_height,
2858                FilterMode filtering) {
2859#ifdef CPU_X86
2860  // environment variable overrides for testing.
2861  char *filter_override = getenv("LIBYUV_FILTER");
2862  if (filter_override) {
2863    filtering = (FilterMode)atoi(filter_override);  // NOLINT
2864  }
2865#endif
2866  // Use specialized scales to improve performance for common resolutions.
2867  // For example, all the 1/2 scalings will use ScalePlaneDown2()
2868  if (dst_width == src_width && dst_height == src_height) {
2869    // Straight copy.
2870    CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height);
2871  } else if (dst_width <= src_width && dst_height <= src_height) {
2872    // Scale down.
2873    if (use_reference_impl_) {
2874      // For testing, allow the optimized versions to be disabled.
2875      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
2876                     src_stride, dst_stride, src, dst, filtering);
2877    } else if (4 * dst_width == 3 * src_width &&
2878               4 * dst_height == 3 * src_height) {
2879      // optimized, 3/4
2880      ScalePlaneDown34(src_width, src_height, dst_width, dst_height,
2881                       src_stride, dst_stride, src, dst, filtering);
2882    } else if (2 * dst_width == src_width && 2 * dst_height == src_height) {
2883      // optimized, 1/2
2884      ScalePlaneDown2(src_width, src_height, dst_width, dst_height,
2885                      src_stride, dst_stride, src, dst, filtering);
2886    // 3/8 rounded up for odd sized chroma height.
2887    } else if (8 * dst_width == 3 * src_width &&
2888               dst_height == ((src_height * 3 + 7) / 8)) {
2889      // optimized, 3/8
2890      ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
2891                       src_stride, dst_stride, src, dst, filtering);
2892    } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
2893               filtering != kFilterBilinear) {
2894      // optimized, 1/4
2895      ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
2896                      src_stride, dst_stride, src, dst, filtering);
2897    } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
2898               filtering != kFilterBilinear) {
2899      // optimized, 1/8
2900      ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
2901                      src_stride, dst_stride, src, dst, filtering);
2902    } else {
2903      // Arbitrary downsample
2904      ScalePlaneDown(src_width, src_height, dst_width, dst_height,
2905                     src_stride, dst_stride, src, dst, filtering);
2906    }
2907  } else {
2908    // Arbitrary scale up and/or down.
2909    ScalePlaneAnySize(src_width, src_height, dst_width, dst_height,
2910                      src_stride, dst_stride, src, dst, filtering);
2911  }
2912}
2913
2914// Scale an I420 image.
2915// This function in turn calls a scaling function for each plane.
2916
2917#define UNDER_ALLOCATED_HACK 1
2918
2919LIBYUV_API
2920int I420Scale(const uint8* src_y, int src_stride_y,
2921              const uint8* src_u, int src_stride_u,
2922              const uint8* src_v, int src_stride_v,
2923              int src_width, int src_height,
2924              uint8* dst_y, int dst_stride_y,
2925              uint8* dst_u, int dst_stride_u,
2926              uint8* dst_v, int dst_stride_v,
2927              int dst_width, int dst_height,
2928              FilterMode filtering) {
2929  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
2930      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
2931    return -1;
2932  }
2933  // Negative height means invert the image.
2934  if (src_height < 0) {
2935    src_height = -src_height;
2936    int halfheight = (src_height + 1) >> 1;
2937    src_y = src_y + (src_height - 1) * src_stride_y;
2938    src_u = src_u + (halfheight - 1) * src_stride_u;
2939    src_v = src_v + (halfheight - 1) * src_stride_v;
2940    src_stride_y = -src_stride_y;
2941    src_stride_u = -src_stride_u;
2942    src_stride_v = -src_stride_v;
2943  }
2944  int src_halfwidth = (src_width + 1) >> 1;
2945  int src_halfheight = (src_height + 1) >> 1;
2946  int dst_halfwidth = (dst_width + 1) >> 1;
2947  int dst_halfheight = (dst_height + 1) >> 1;
2948
2949#ifdef UNDER_ALLOCATED_HACK
2950  // If caller passed width / 2 for stride, adjust halfwidth to match.
2951  if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
2952    src_halfwidth = src_width >> 1;
2953  }
2954  if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
2955    dst_halfwidth = dst_width >> 1;
2956  }
2957  // If caller used height / 2 when computing src_v, it will point into what
2958  // should be the src_u plane. Detect this and reduce halfheight to match.
2959  int uv_src_plane_size = src_halfwidth * src_halfheight;
2960  if ((src_height & 1) &&
2961      (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
2962    src_halfheight = src_height >> 1;
2963  }
2964  int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
2965  if ((dst_height & 1) &&
2966      (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
2967    dst_halfheight = dst_height >> 1;
2968  }
2969#endif
2970
2971  ScalePlane(src_y, src_stride_y, src_width, src_height,
2972             dst_y, dst_stride_y, dst_width, dst_height,
2973             filtering);
2974  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
2975             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
2976             filtering);
2977  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
2978             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
2979             filtering);
2980  return 0;
2981}
2982
2983// Deprecated api
2984LIBYUV_API
2985int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v,
2986          int src_stride_y, int src_stride_u, int src_stride_v,
2987          int src_width, int src_height,
2988          uint8* dst_y, uint8* dst_u, uint8* dst_v,
2989          int dst_stride_y, int dst_stride_u, int dst_stride_v,
2990          int dst_width, int dst_height,
2991          bool interpolate) {
2992  if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
2993      !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
2994    return -1;
2995  }
2996  // Negative height means invert the image.
2997  if (src_height < 0) {
2998    src_height = -src_height;
2999    int halfheight = (src_height + 1) >> 1;
3000    src_y = src_y + (src_height - 1) * src_stride_y;
3001    src_u = src_u + (halfheight - 1) * src_stride_u;
3002    src_v = src_v + (halfheight - 1) * src_stride_v;
3003    src_stride_y = -src_stride_y;
3004    src_stride_u = -src_stride_u;
3005    src_stride_v = -src_stride_v;
3006  }
3007  int src_halfwidth = (src_width + 1) >> 1;
3008  int src_halfheight = (src_height + 1) >> 1;
3009  int dst_halfwidth = (dst_width + 1) >> 1;
3010  int dst_halfheight = (dst_height + 1) >> 1;
3011  FilterMode filtering = interpolate ? kFilterBox : kFilterNone;
3012
3013#ifdef UNDER_ALLOCATED_HACK
3014  // If caller passed width / 2 for stride, adjust halfwidth to match.
3015  if ((src_width & 1) && src_stride_u && src_halfwidth > abs(src_stride_u)) {
3016    src_halfwidth = src_width >> 1;
3017  }
3018  if ((dst_width & 1) && dst_stride_u && dst_halfwidth > abs(dst_stride_u)) {
3019    dst_halfwidth = dst_width >> 1;
3020  }
3021  // If caller used height / 2 when computing src_v, it will point into what
3022  // should be the src_u plane. Detect this and reduce halfheight to match.
3023  int uv_src_plane_size = src_halfwidth * src_halfheight;
3024  if ((src_height & 1) &&
3025      (src_v > src_u) && (src_v < (src_u + uv_src_plane_size))) {
3026    src_halfheight = src_height >> 1;
3027  }
3028  int uv_dst_plane_size = dst_halfwidth * dst_halfheight;
3029  if ((dst_height & 1) &&
3030      (dst_v > dst_u) && (dst_v < (dst_u + uv_dst_plane_size))) {
3031    dst_halfheight = dst_height >> 1;
3032  }
3033#endif
3034
3035  ScalePlane(src_y, src_stride_y, src_width, src_height,
3036             dst_y, dst_stride_y, dst_width, dst_height,
3037             filtering);
3038  ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight,
3039             dst_u, dst_stride_u, dst_halfwidth, dst_halfheight,
3040             filtering);
3041  ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight,
3042             dst_v, dst_stride_v, dst_halfwidth, dst_halfheight,
3043             filtering);
3044  return 0;
3045}
3046
3047// Deprecated api
3048LIBYUV_API
3049int ScaleOffset(const uint8* src, int src_width, int src_height,
3050                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
3051                bool interpolate) {
3052  if (!src || src_width <= 0 || src_height <= 0 ||
3053      !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 ||
3054      dst_yoffset >= dst_height) {
3055    return -1;
3056  }
3057  dst_yoffset = dst_yoffset & ~1;  // chroma requires offset to multiple of 2.
3058  int src_halfwidth = (src_width + 1) >> 1;
3059  int src_halfheight = (src_height + 1) >> 1;
3060  int dst_halfwidth = (dst_width + 1) >> 1;
3061  int dst_halfheight = (dst_height + 1) >> 1;
3062  int aheight = dst_height - dst_yoffset * 2;  // actual output height
3063  const uint8* src_y = src;
3064  const uint8* src_u = src + src_width * src_height;
3065  const uint8* src_v = src + src_width * src_height +
3066                             src_halfwidth * src_halfheight;
3067  uint8* dst_y = dst + dst_yoffset * dst_width;
3068  uint8* dst_u = dst + dst_width * dst_height +
3069                 (dst_yoffset >> 1) * dst_halfwidth;
3070  uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight +
3071                 (dst_yoffset >> 1) * dst_halfwidth;
3072  return Scale(src_y, src_u, src_v, src_width, src_halfwidth, src_halfwidth,
3073               src_width, src_height, dst_y, dst_u, dst_v, dst_width,
3074               dst_halfwidth, dst_halfwidth, dst_width, aheight, interpolate);
3075}
3076
3077#ifdef __cplusplus
3078}  // extern "C"
3079}  // namespace libyuv
3080#endif
3081