1/*
2 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12#include "libyuv/scale_row.h"
13
14#ifdef __cplusplus
15namespace libyuv {
16extern "C" {
17#endif
18
19// This module is for Visual C x86.
20#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
21    defined(_MSC_VER) && !defined(__clang__)
22
// pshufb control masks below: a byte index of 128 has the high bit set,
// which makes pshufb write 0 to that output byte.

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filter: each weighted sum gets +2
// before the final psrlw 2, i.e. round-half-up division by 4.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3; used with pmulhuw, so each
// entry is a 16.16-style reciprocal (65536 / divisor) truncated to 16 bits.
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2 (pmulhuw reciprocals as above)
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
96
// Reads 32 pixels, throws half away and writes 16 pixels.
// Point sample: psrlw 8 keeps the odd byte of each 16-bit pair, then
// packuswb compacts the words back to bytes, halving the row width.
// Naked function: no prologue is generated, so arguments are read
// directly from [esp + offset] and the function ends with an explicit ret.
// Loop consumes 16 output pixels per iteration; NOTE(review): appears to
// assume dst_width is a positive multiple of 16 — caller must guarantee.
__declspec(naked)
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
122
// Blends 32x1 rectangle to 16x1.
// Horizontal average: xmm5 = 0x00ff00ff isolates even bytes, psrlw 8
// isolates odd bytes; pavgw averages the pair with rounding, then the
// words are packed back to 16 output bytes.
__declspec(naked)
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored (single row)
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8               // odd bytes as words
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5            // even bytes as words
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2            // rounded average of each pair
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
158
// Blends 32x2 rectangle to 16x1.
// 2x2 box filter: pavgb averages the two source rows vertically, then the
// same even/odd word-split + pavgw trick as the Linear variant averages
// horizontal pairs. esi is callee-saved, hence the push/pop; argument
// offsets include +4 for the saved register.
__declspec(naked)
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]     // second row
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}
200
201#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
// AVX2 version of ScaleRowDown2_SSE2 operating on 32-byte registers.
// vpackuswb packs within 128-bit lanes, so vpermq 0xd8 restores the
// natural byte order afterward; vzeroupper avoids SSE/AVX transition
// penalties before returning.
__declspec(naked)
void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpsrlw      ymm0, ymm0, 8        // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8     // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}
229
// Blends 64x1 rectangle to 32x1.
// Horizontal average via vpmaddubsw with an all-ones byte constant
// (each adjacent byte pair sums into a word), then vpavgw against zero
// computes (sum + 1) / 2, i.e. a rounded average of the pair.
__declspec(naked)
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov         eax, [esp + 4]        // src_ptr
                                      // src_stride ignored (single row)
    mov         edx, [esp + 12]       // dst_ptr
    mov         ecx, [esp + 16]       // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]

    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}
266
// Blends 64x2 rectangle to 32x1.
// 2x2 box filter: vpavgb (with a memory operand for the second row)
// averages rows, then vpmaddubsw + vpavgw averages horizontal pairs as
// in the Linear variant. esi holds src_stride and is callee-saved.
__declspec(naked)
void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push        esi
    mov         eax, [esp + 4 + 4]    // src_ptr
    mov         esi, [esp + 4 + 8]    // src_stride
    mov         edx, [esp + 4 + 12]   // dst_ptr
    mov         ecx, [esp + 4 + 16]   // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4      // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5      // constant 0

  wloop:
    vmovdqu     ymm0, [eax]           // average rows
    vmovdqu     ymm1, [eax + 32]
    vpavgb      ymm0, ymm0, [eax + esi]
    vpavgb      ymm1, ymm1, [eax + esi + 32]
    lea         eax,  [eax + 64]

    vpmaddubsw  ymm0, ymm0, ymm4      // average horizontally
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5      // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    pop         esi
    vzeroupper
    ret
  }
}
307#endif  // HAS_SCALEROWDOWN2_AVX2
308
// Point samples 32 pixels to 8 pixels.
// Keeps byte 2 of every 4-byte group: the 0x00ff0000 dword mask isolates
// it, then two packuswb passes (with an intermediate psrlw 8 to move the
// surviving byte into the low half of each word) compact 32 bytes to 8.
__declspec(naked)
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5            // keep one byte per dword
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0 // 8 output pixels
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}
339
// Blends 32x4 rectangle to 8x1.
// 4x4 box filter: pavgb folds four rows pairwise into one averaged row
// (rows 0+1, rows 2+3, then the two results), then two column-averaging
// stages (32->16, 16->8) using the 0x00ff00ff even/odd word-split.
// esi = src_stride, edi = src_stride * 3; both callee-saved, so argument
// offsets include +8 for the two pushed registers.
__declspec(naked)
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

  wloop:
    movdqu     xmm0, [eax]           // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2            // rows 0+1
    pavgb      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    movdqu     xmm4, [eax + edi]
    movdqu     xmm5, [eax + edi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm2, xmm4            // rows 2+3
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2            // combine all four rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}
398
399#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
// AVX2 version of ScaleRowDown4_SSE2: keeps byte 2 of each dword, packs
// twice, and uses vpermq 0xd8 after each in-lane vpackuswb to restore
// byte order. Final store is the low 128 bits only (16 output pixels).
__declspec(naked)
void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov         eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
    mov         edx, [esp + 12]       // dst_ptr
    mov         ecx, [esp + 16]       // dst_width
    vpcmpeqb    ymm5, ymm5, ymm5      // generate mask 0x00ff0000
    vpsrld      ymm5, ymm5, 24
    vpslld      ymm5, ymm5, 16

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpand       ymm0, ymm0, ymm5
    vpand       ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb
    vmovdqu     [edx], xmm0           // low 128 bits = 16 pixels
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    vzeroupper
    ret
  }
}
433
// Blends 64x4 rectangle to 16x1.
// AVX2 4x4 box filter mirroring ScaleRowDown4Box_SSE2: vpavgb folds four
// rows pairwise, then two column-averaging stages (64->32, 32->16) with
// the 0x00ff00ff even/odd word-split and lane fix-ups via vpermq.
// esi = src_stride, edi = src_stride * 3 (both callee-saved).
__declspec(naked)
void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push        esi
    push        edi
    mov         eax, [esp + 8 + 4]    // src_ptr
    mov         esi, [esp + 8 + 8]    // src_stride
    mov         edx, [esp + 8 + 12]   // dst_ptr
    mov         ecx, [esp + 8 + 16]   // dst_width
    lea         edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb    ymm7, ymm7, ymm7      // generate mask 0x00ff00ff
    vpsrlw      ymm7, ymm7, 8

  wloop:
    vmovdqu     ymm0, [eax]           // average rows
    vmovdqu     ymm1, [eax + 32]
    vpavgb      ymm0, ymm0, [eax + esi]          // rows 0+1
    vpavgb      ymm1, ymm1, [eax + esi + 32]
    vmovdqu     ymm2, [eax + esi * 2]
    vmovdqu     ymm3, [eax + esi * 2 + 32]
    vpavgb      ymm2, ymm2, [eax + edi]          // rows 2+3
    vpavgb      ymm3, ymm3, [eax + edi + 32]
    lea         eax, [eax + 64]
    vpavgb      ymm0, ymm0, ymm2     // combine all four rows
    vpavgb      ymm1, ymm1, ymm3

    vpand       ymm2, ymm0, ymm7      // average columns (64 to 32 pixels)
    vpand       ymm3, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 8
    vpsrlw      ymm1, ymm1, 8
    vpavgw      ymm0, ymm0, ymm2
    vpavgw      ymm1, ymm1, ymm3
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vpand       ymm2, ymm0, ymm7      // average columns (32 to 16 pixels)
    vpsrlw      ymm0, ymm0, 8
    vpavgw      ymm0, ymm0, ymm2
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8      // unmutate vpackuswb

    vmovdqu     [edx], xmm0           // low 128 bits = 16 pixels
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
488#endif  // HAS_SCALEROWDOWN4_AVX2
489
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// kShuf0/1/2 select 8 of every ~10.67 source bytes; palignr builds the
// middle window spanning the boundary between the two 16-byte loads.
__declspec(naked)
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, kShuf0
    movdqa     xmm4, kShuf1
    movdqa     xmm5, kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8         // bytes 8..23 of the 32-byte window
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}
525
// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Rows are blended 1:1 (single pavgb), then each output pixel is a
// weighted sum of two source pixels (kMadd* coefficients, total weight 4)
// rounded via kRound34 (+2) and psrlw 2.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1            // blend the two rows 1:1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5            // weighted pairs, sum of weights = 4
    paddsw     xmm0, xmm7            // +2 for rounding
    psrlw      xmm0, 2               // /4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21         // no free register; reload each pass
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
596
// Same as ScaleRowDown34_1_Box_SSSE3 except the rows are blended 3:1
// toward row 0: the double pavgb sequence computes approximately
// (3*row0 + row1) / 4 with rounding, before the same shuffle/madd stage.
// Note that movdqa+palign may be better than movdqu.
__declspec(naked)
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

  wloop:
    movdqu     xmm0, [eax]           // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0            // (r0 + r1) / 2
    pavgb      xmm0, xmm1            // ~ (3*r0 + r1) / 4
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7            // +2 for rounding
    psrlw      xmm0, 2               // /4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]      // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21         // no free register; reload each pass
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx+24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
656
// 3/8 point sampler

// Scale 32 pixels to 12.
// kShuf38a picks 6 pixels from the first 16 bytes into the low half,
// kShuf38b picks 6 from the next 16 into the high half; paddusb merges
// them (each lane non-zero in only one operand).
__declspec(naked)
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

  xloop:
    movdqu     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1            // combine: lanes are disjoint

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1       // remaining 4 bytes
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}
689
// Scale 16x3 pixels to 6x1 with interpolation.
// Sums 3 rows into 16-bit words (saturating), adds each group of 3
// adjacent columns, then multiplies by 65536/9 (or 65536/6 for the
// 2-wide edge group) via pmulhuw to divide.
__declspec(naked)
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5            // zero for byte->word unpack

  xloop:
    movdqu     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5            // widen bytes to words
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2] // third row
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2               // shift by one word
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0            // word 0 now holds sum of 3 columns
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6       // overlapping 4-byte store: bytes 2..5
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}
754
// Scale 16x2 pixels to 6x1 with interpolation.
// Averages 2 rows with pavgb, gathers the three members of each column
// group via kShufAb0/1/2 (as words), sums them, then divides by 3 (or 2
// for the edge group) via pmulhuw with kScaleAb2.
__declspec(naked)
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

  xloop:
    movdqu     xmm0, [eax]           // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2            // first member of each group, as words
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3            // second member
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4            // third member (zero for 2-wide group)
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1       // overlapping 4-byte store: bytes 2..5
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}
799
// Reads 16 bytes and accumulates to 16 shorts at a time.
// Widens each source byte to a word and adds it (with unsigned
// saturation) into the uint16 accumulator row at dst_ptr. dst_ptr must
// already contain the running sums; this performs dst[i] += src[i].
__declspec(naked)
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov        eax, [esp + 4]   // src_ptr
    mov        edx, [esp + 8]   // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5       // zero for byte->word unpack

  // sum rows
  xloop:
    movdqu     xmm3, [eax]       // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]       // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5        // low 8 bytes -> words
    punpckhbw  xmm3, xmm5        // high 8 bytes -> words
    paddusw    xmm0, xmm2        // sum 16 words (saturating)
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0       // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}
828
829#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
// AVX2 version of ScaleAddRow_SSE2: vpermq pre-permutes the source so
// the in-lane vpunpck produces words in natural order, then saturating
// word adds fold the widened bytes into the uint16 accumulator row.
__declspec(naked)
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  __asm {
    mov         eax, [esp + 4]   // src_ptr
    mov         edx, [esp + 8]   // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5 // zero for byte->word unpack

  // sum rows
  xloop:
    vmovdqu     ymm3, [eax]       // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx] // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0       // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
858#endif  // HAS_SCALEADDROW_AVX2
859
// Bilinear column filtering. SSSE3 version.
// x and dx are 16.16 fixed-point column positions/steps. Each output
// pixel blends source pixels [xi] and [xi+1] with 7-bit fractional
// weights via pmaddubsw. Processes 2 pixels per loop iteration with a
// 1-pixel remainder path. ebx/esi/edi are callee-saved, hence the +12
// on the argument offsets.
__declspec(naked)
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29              // fewer than 2 pixels: remainder only

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9              // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5           // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6           // 0..7f and 7f..0 (weight + inverse)
    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

 xloop29:

    add        ecx, 2 - 1           // restore count; test for 1 leftover
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9              // 7 bit fractions.
    pshufb     xmm2, xmm5           // 0011
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // 16 bit
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl            // single byte store

 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
936
// Reads 16 pixels, duplicates them and writes 32 pixels.
// 2x horizontal upscale by point sampling: punpcklbw/punpckhbw with the
// register itself doubles each byte. x and dx parameters are unused
// (fixed 2x ratio). Loop counts dst_width down by 32 per iteration.
__declspec(naked)
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0        // duplicate low 8 bytes
    punpckhbw  xmm1, xmm1        // duplicate high 8 bytes
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}
961
// Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6).
// ARGB pixels are 4 bytes, so shufps 0xdd selects dwords 1,3 from each
// source register — the odd ARGB pixels — producing 4 output pixels
// per 32 bytes read. (Comment above says "even"; the 0xdd selector
// takes odd dword indices — NOTE(review): upstream naming quirk.)
__declspec(naked)
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd      // pick dwords 1,3 of each source
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}
986
// Blends 8x1 rectangle to 4x1.
// Linear (horizontal-only) 2x ARGB downscale: each output pixel is the
// pavgb average of an even/odd source pixel pair. src_stride is unused;
// only one row is read. Processes 8 source -> 4 destination pixels per loop.
__declspec(naked)
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

  wloop:
    movdqu     xmm0, [eax]           // pixels 0-3
    movdqu     xmm1, [eax + 16]      // pixels 4-7
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2            // average each even/odd pair per byte
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // wrote 4 ARGB pixels
    jg         wloop

    ret
  }
}
1014
// Blends 8x2 rectangle to 4x1.
// Box-filter 2x2 ARGB downscale: averages two rows (src and src+stride)
// vertically, then averages even/odd column pairs horizontally.
// Processes 8 source columns -> 4 destination pixels per loop.
__declspec(naked)
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

  wloop:
    movdqu     xmm0, [eax]           // row0 pixels 0-3
    movdqu     xmm1, [eax + 16]      // row0 pixels 4-7
    movdqu     xmm2, [eax + esi]     // row1 pixels 0-3
    movdqu     xmm3, [eax + esi + 16] // row1 pixels 4-7
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // wrote 4 ARGB pixels
    jg         wloop

    pop        esi
    ret
  }
}
1048
// Reads 4 pixels at a time.
// Point-samples every src_stepx-th ARGB pixel into the destination.
// src_stride is unused (single row). Gathers 4 pixels per loop with
// individual movd loads at stride src_stepx * 4 bytes.
__declspec(naked)
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]        // step in bytes (4 bytes/pixel)
    lea        edi, [ebx + ebx * 2]  // 3 * step, for the 4th sample

  wloop:
    movd       xmm0, [eax]           // sample 0
    movd       xmm1, [eax + ebx]     // sample 1
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2] // sample 2
    movd       xmm3, [eax + edi]     // sample 3
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2            // pack 4 samples into one register
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // wrote 4 ARGB pixels
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}
1084
// Blends four 2x2 to 4x1.
// Box-filters 2x2 ARGB blocks spaced src_stepx pixels apart: loads a
// horizontal pixel pair from row0 and the matching pair from row1
// (src + src_stride), averages rows then even/odd columns.
// Produces 4 destination pixels per loop.
__declspec(naked)
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]         // step in bytes (4 bytes/pixel)
    lea        edi, [ebx + ebx * 2]   // 3 * step, for the 4th sample

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4                // wrote 4 ARGB pixels
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
1132
// Column scaling unfiltered. SSE2 version.
// Point-samples ARGB pixels at 16.16 fixed-point positions: for each
// destination pixel i, copies src_argb[(x + i * dx) >> 16]. xmm2 holds
// four consecutive fixed-point x values; pextrw of odd words extracts
// their integer parts (high 16 bits of each dword). Handles 4 pixels per
// loop plus 2- and 1-pixel remainders.
__declspec(naked)
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    // Build xmm2 = {x, x+dx, x+2dx, x+3dx} and xmm3 = dx*4 broadcast.
    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99                // nothing to do for dst_width <= 0
    sub        ecx, 4
    jl         xloop49                // fewer than 4 pixels: remainders only

    // 4 Pixel loop.
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4                 // 4 pixels
    jge        xloop4

 xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer for the 1-pixel tail.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

    // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}
1210
1211// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
1212// TODO(fbarchard): Port to Neon
1213
// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
// Interleaves the bytes of pixel0 (low 8 bytes) with pixel1 within each
// half so each destination byte pair holds the same channel from the two
// pixels being blended.
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};
1219
// Shuffle table for duplicating 2 fractions into 8 bytes each
// Broadcasts byte 0 (fraction of pixel pair 0) across the low 8 bytes and
// byte 4 (fraction of pixel pair 1) across the high 8 bytes.
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
1224
// Bilinear ARGB column scaling: each destination pixel blends source
// pixels [x>>16] and [x>>16]+1 using the top 7 bits of the 16-bit
// fraction as pmaddubsw weights (weight, 128-weight style via pxor with
// 0x7f). Processes 2 destination pixels per loop plus a 1-pixel tail.
__declspec(naked)
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29              // fewer than 2 pixels: tail only

    // Set up xmm2 = {x0, x1} and xmm3 = {2dx, 2dx} for the pair loop.
    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9              // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5           // 0000000011111111
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

 xloop29:

    add        ecx, 2 - 1           // was dst_width odd?
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9              // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5           // 00000000
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

 xloop99:

    pop        edi
    pop        esi
    ret
  }
}
1294
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Point-sample 2x horizontal ARGB upscale: each source dword (pixel) is
// written twice. x/dx are ignored; the caller selects this routine only
// for an exact 1:2 ratio. Handles 8 destination pixels per loop.
__declspec(naked)
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

  wloop:
    movdqu     xmm0, [eax]       // 4 source pixels
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0        // duplicate low 2 pixels
    punpckhdq  xmm1, xmm1        // duplicate high 2 pixels
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8            // wrote 8 ARGB pixels
    jg         wloop

    ret
  }
}
1319
// Divide num by div and return as 16.16 fixed point result.
// Computes ((int64)num << 16) / div using a 64-bit edx:eax dividend so
// the shift cannot overflow before the divide. NOTE(review): like any
// idiv, this faults on div == 0 or a quotient outside int32 — callers
// are presumed to guarantee valid inputs.
__declspec(naked)
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret                          // quotient returned in eax
  }
}
1332
// Divide num by div and return as 16.16 fixed point result.
// Variant used for inclusive ranges: computes
// (((int64)num << 16) - 0x00010001) / (div - 1), i.e. both numerator and
// denominator are reduced before dividing (the 64-bit subtract borrows
// via sbb). NOTE(review): faults if div == 1 (divide by zero) — callers
// are presumed to guarantee div > 1.
__declspec(naked)
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // denom
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001   // 64-bit subtract of 0x00010001...
    sbb        edx, 0            // ...with borrow into the high half
    sub        ecx, 1            // denom - 1
    idiv       ecx
    ret                          // quotient returned in eax
  }
}
1349#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
1350
1351#ifdef __cplusplus
1352}  // extern "C"
1353}  // namespace libyuv
1354#endif
1355