/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant: +2 before the >> 2 in the 3/4 box filters.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                         6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                         6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0,         0};
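
// These tables drive a reciprocal multiply: pmulhuw computes
// (sum * (65536 / n)) >> 16, which approximates sum / n without a divide.
// A scalar sketch of the idea (illustrative only, not compiled here):
//
//   static uint8 DivideByN(uint16 sum, uint16 scale /* = 65536 / n */) {
//     return (uint8)(((uint32)sum * scale) >> 16);  // ~ sum / n
//   }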

// Reads 32 pixels, throws half away and writes 16 pixels.
__declspec(naked) void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8  // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_ptr,
                                                 int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    ret
  }
}
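
// Two identities do the work above: pmaddubsw against bytes of 0x01 gives
// the horizontal pair sum a + b, and pavgw with zero computes (x + 1) >> 1,
// so each output is the rounded average (a + b + 1) / 2. Scalar sketch
// (illustrative only, not compiled here):
//
//   dst_ptr[i] = (uint8)((src_ptr[2 * i] + src_ptr[2 * i + 1] + 1) >> 1);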

// Blends 32x2 rectangle to 16x1.
__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width

    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    packuswb   xmm4, xmm4
    pxor       xmm5, xmm5  // constant 0

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add
    paddw      xmm1, xmm3
    psrlw      xmm0, 1
    psrlw      xmm1, 1
    pavgw      xmm0, xmm5  // (x + 1) / 2
    pavgw      xmm1, xmm5
    packuswb   xmm0, xmm1
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 16
    jg         wloop

    pop        esi
    ret
  }
}
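
// The 2x2 box wants (a + b + c + d + 2) / 4. The 9 bit row sums are halved
// with a truncating psrlw, then rounded with pavgw against zero. The
// identity ((sum >> 1) + 1) >> 1 == (sum + 2) >> 2 holds for all unsigned
// sums, so no rounding error is introduced. Scalar sketch (illustrative
// only, not compiled here):
//
//   uint16 sum = r0[2 * i] + r0[2 * i + 1] + r1[2 * i] + r1[2 * i + 1];
//   dst_ptr[i] = (uint8)(((sum >> 1) + 1) >> 1);  // == (sum + 2) / 4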

#ifdef HAS_SCALEROWDOWN2_AVX2
// Reads 64 pixels, throws half away and writes 32 pixels.
__declspec(naked) void ScaleRowDown2_AVX2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpsrlw      ymm0, ymm0, 8  // isolate odd pixels.
    vpsrlw      ymm1, ymm1, 8
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x1 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                                                ptrdiff_t src_stride,
                                                uint8* dst_ptr,
                                                int dst_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    // src_stride
    mov         edx, [esp + 12]  // dst_ptr
    mov         ecx, [esp + 16]  // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    vzeroupper
    ret
  }
}

// For rounding, average = (sum + 2) / 4 is computed as
// pavgw(sum >> 1, 0), since pavgw(x, 0) = (x + 1) / 2.
// Blends 64x2 rectangle to 32x1.
__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr,
                                             int dst_width) {
  __asm {
    push        esi
    mov         eax, [esp + 4 + 4]  // src_ptr
    mov         esi, [esp + 4 + 8]  // src_stride
    mov         edx, [esp + 4 + 12]  // dst_ptr
    mov         ecx, [esp + 4 + 16]  // dst_width

    vpcmpeqb    ymm4, ymm4, ymm4  // '1' constant, 8b
    vpsrlw      ymm4, ymm4, 15
    vpackuswb   ymm4, ymm4, ymm4
    vpxor       ymm5, ymm5, ymm5  // constant 0

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // vertical add
    vpaddw      ymm1, ymm1, ymm3
    vpsrlw      ymm0, ymm0, 1  // (x + 2) / 4 = (x / 2 + 1) / 2
    vpsrlw      ymm1, ymm1, 1
    vpavgw      ymm0, ymm0, ymm5  // (x + 1) / 2
    vpavgw      ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], ymm0
    lea         edx, [edx + 32]
    sub         ecx, 32
    jg          wloop

    pop         esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN2_AVX2

// Point samples 32 pixels to 8 pixels.
__declspec(naked) void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                                           ptrdiff_t src_stride,
                                           uint8* dst_ptr,
                                           int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    pcmpeqb    xmm5, xmm5  // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8* dst_ptr,
                                              int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]  // src_ptr
    mov        esi, [esp + 8 + 8]  // src_stride
    mov        edx, [esp + 8 + 12]  // dst_ptr
    mov        ecx, [esp + 8 + 16]  // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm4, xmm4  // constant 0x0101
    psrlw      xmm4, 15
    movdqa     xmm5, xmm4
    packuswb   xmm4, xmm4
    psllw      xmm5, 3  // constant 0x0008

  wloop:
    movdqu     xmm0, [eax]  // average rows
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    pmaddubsw  xmm0, xmm4  // horizontal add
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // vertical add rows 0, 1
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + esi * 2]
    movdqu     xmm3, [eax + esi * 2 + 16]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 2
    paddw      xmm1, xmm3
    movdqu     xmm2, [eax + edi]
    movdqu     xmm3, [eax + edi + 16]
    lea        eax, [eax + 32]
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    paddw      xmm0, xmm2  // add row 3
    paddw      xmm1, xmm3
    phaddw     xmm0, xmm1
    paddw      xmm0, xmm5  // + 8 for round
    psrlw      xmm0, 4  // /16 for average of 4 * 4
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    sub        ecx, 8
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}
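
// Each output above is the rounded mean of a 4x4 block of source pixels:
//   dst = (sum_of_16_pixels + 8) >> 4
// with the +8 (xmm5) giving round-to-nearest before the divide by 16.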

#ifdef HAS_SCALEROWDOWN4_AVX2
// Point samples 64 pixels to 16 pixels.
__declspec(naked) void ScaleRowDown4_AVX2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr,
                                          int dst_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov         edx, [esp + 12]  // dst_ptr
    mov         ecx, [esp + 16]  // dst_width
    vpcmpeqb    ymm5, ymm5, ymm5  // generate mask 0x00ff0000
    vpsrld      ymm5, ymm5, 24
    vpslld      ymm5, ymm5, 16

  wloop:
    vmovdqu     ymm0, [eax]
    vmovdqu     ymm1, [eax + 32]
    lea         eax,  [eax + 64]
    vpand       ymm0, ymm0, ymm5
    vpand       ymm1, ymm1, ymm5
    vpackuswb   ymm0, ymm0, ymm1
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vpsrlw      ymm0, ymm0, 8
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    vzeroupper
    ret
  }
}

// Blends 64x4 rectangle to 16x1.
__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                                             ptrdiff_t src_stride,
                                             uint8* dst_ptr,
                                             int dst_width) {
  __asm {
    push        esi
    push        edi
    mov         eax, [esp + 8 + 4]  // src_ptr
    mov         esi, [esp + 8 + 8]  // src_stride
    mov         edx, [esp + 8 + 12]  // dst_ptr
    mov         ecx, [esp + 8 + 16]  // dst_width
    lea         edi, [esi + esi * 2]  // src_stride * 3
    vpcmpeqb    ymm4, ymm4, ymm4  // constant 0x0101
    vpsrlw      ymm4, ymm4, 15
    vpsllw      ymm5, ymm4, 3  // constant 0x0008
    vpackuswb   ymm4, ymm4, ymm4

  wloop:
    vmovdqu     ymm0, [eax]  // average rows
    vmovdqu     ymm1, [eax + 32]
    vmovdqu     ymm2, [eax + esi]
    vmovdqu     ymm3, [eax + esi + 32]
    vpmaddubsw  ymm0, ymm0, ymm4  // horizontal add
    vpmaddubsw  ymm1, ymm1, ymm4
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // vertical add rows 0, 1
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + esi * 2]
    vmovdqu     ymm3, [eax + esi * 2 + 32]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // add row 2
    vpaddw      ymm1, ymm1, ymm3
    vmovdqu     ymm2, [eax + edi]
    vmovdqu     ymm3, [eax + edi + 32]
    lea         eax,  [eax + 64]
    vpmaddubsw  ymm2, ymm2, ymm4
    vpmaddubsw  ymm3, ymm3, ymm4
    vpaddw      ymm0, ymm0, ymm2  // add row 3
    vpaddw      ymm1, ymm1, ymm3
    vphaddw     ymm0, ymm0, ymm1  // mutates
    vpermq      ymm0, ymm0, 0xd8  // unmutate vphaddw
    vpaddw      ymm0, ymm0, ymm5  // + 8 for round
    vpsrlw      ymm0, ymm0, 4  // /16 for average of 4 * 4
    vpackuswb   ymm0, ymm0, ymm0
    vpermq      ymm0, ymm0, 0xd8  // unmutate vpackuswb
    vmovdqu     [edx], xmm0
    lea         edx, [edx + 16]
    sub         ecx, 16
    jg          wloop

    pop        edi
    pop        esi
    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEROWDOWN4_AVX2

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm3, xmmword ptr kShuf0
    movdqa     xmm4, xmmword ptr kShuf1
    movdqa     xmm5, xmmword ptr kShuf2

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}
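
// The shuffles keep pixels 0, 1 and 3 of every group of 4 (see the index
// patterns in kShuf0/kShuf1/kShuf2). Scalar sketch of the same point
// sampling (illustrative only, not compiled here):
//
//   for (int i = 0; i < dst_width; i += 3) {
//     dst_ptr[i + 0] = src_ptr[0];
//     dst_ptr[i + 1] = src_ptr[1];
//     dst_ptr[i + 2] = src_ptr[3];
//     src_ptr += 4;
//   }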

// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}

// Note that movdqa+palign may be better than movdqu.
__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShuf01
    movdqa     xmm3, xmmword ptr kShuf11
    movdqa     xmm4, xmmword ptr kShuf21
    movdqa     xmm5, xmmword ptr kMadd01
    movdqa     xmm6, xmmword ptr kMadd11
    movdqa     xmm7, xmmword ptr kRound34

  wloop:
    movdqu     xmm0, [eax]  // pixels 0..7
    movdqu     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]  // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqu     xmm0, [eax + 16]  // pixels 16..23
    movdqu     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, xmmword ptr kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    pop        esi
    ret
  }
}
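
// The two 34_x_Box variants differ only in vertical weighting. _1_Box does
// a single pavgb, blending the two rows 1:1. _0_Box nests two pavgb ops:
//   t  = avg(row1, row0)   // (r1 + r0 + 1) / 2
//   px = avg(row0, t)      // ~ (3 * r0 + r1 + 2) / 4, a 3:1 weighting
// Horizontally, both shuffle neighbors into pairs and apply pmaddubsw taps
// (3,1), (2,2), (1,3) from kMadd01/kMadd11/kMadd21, then round with
// kRound34 (+2) and shift right by 2.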

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                                            ptrdiff_t src_stride,
                                            uint8* dst_ptr,
                                            int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_ptr
    mov        ecx, [esp + 16]  // dst_width
    movdqa     xmm4, xmmword ptr kShuf38a
    movdqa     xmm5, xmmword ptr kShuf38b

  xloop:
    movdqu     xmm0, [eax]  // 16 pixels -> 0,1,2,3,4,5
    movdqu     xmm1, [eax + 16]  // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    sub        ecx, 12
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAc
    movdqa     xmm3, xmmword ptr kShufAc3
    movdqa     xmm4, xmmword ptr kScaleAc33
    pxor       xmm5, xmm5

  xloop:
    movdqu     xmm0, [eax]  // sum up 3 rows into xmm0/1
    movdqu     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqu     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0  // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1  // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4  // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    movd       [edx], xmm6  // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}
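
// Each output pixel above sums a 3 row window; within each output triple
// the first two pixels also span 3 source columns (9 samples) while the
// third spans only 2 (6 samples). kScaleAc33 then divides by 9, 9, 6 via
// a reciprocal multiply:
//   dst = (uint8)((sum * (65536 / n)) >> 16)  // n = 9 or 6, via pmulhuw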

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                                  ptrdiff_t src_stride,
                                                  uint8* dst_ptr,
                                                  int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_ptr
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_ptr
    mov        ecx, [esp + 4 + 16]  // dst_width
    movdqa     xmm2, xmmword ptr kShufAb0
    movdqa     xmm3, xmmword ptr kShufAb1
    movdqa     xmm4, xmmword ptr kShufAb2
    movdqa     xmm5, xmmword ptr kScaleAb2

  xloop:
    movdqu     xmm0, [eax]  // average 2 rows into xmm0
    movdqu     xmm1, [eax + esi]
    lea        eax, [eax + 16]
    pavgb      xmm0, xmm1

    movdqa     xmm1, xmm0  // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5  // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    movd       [edx], xmm1  // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    sub        ecx, 6
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16 bytes and accumulates to 16 shorts at a time.
__declspec(naked) void ScaleAddRow_SSE2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov        eax, [esp + 4]  // src_ptr
    mov        edx, [esp + 8]  // dst_ptr
    mov        ecx, [esp + 12]  // src_width
    pxor       xmm5, xmm5

    // sum rows
  xloop:
    movdqu     xmm3, [eax]  // read 16 bytes
    lea        eax, [eax + 16]
    movdqu     xmm0, [edx]  // read 16 words from destination
    movdqu     xmm1, [edx + 16]
    movdqa     xmm2, xmm3
    punpcklbw  xmm2, xmm5
    punpckhbw  xmm3, xmm5
    paddusw    xmm0, xmm2  // sum 16 words
    paddusw    xmm1, xmm3
    movdqu     [edx], xmm0  // write 16 words to destination
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 16
    jg         xloop
    ret
  }
}
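
// Scalar equivalent of the accumulator above (a sketch, not compiled here;
// note the SIMD path uses saturating adds where this uses plain ones):
//
//   for (int i = 0; i < src_width; ++i) {
//     dst_ptr[i] += src_ptr[i];  // widen each byte and accumulate
//   }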

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
__declspec(naked) void ScaleAddRow_AVX2(const uint8* src_ptr,
                                        uint16* dst_ptr,
                                        int src_width) {
  __asm {
    mov         eax, [esp + 4]  // src_ptr
    mov         edx, [esp + 8]  // dst_ptr
    mov         ecx, [esp + 12]  // src_width
    vpxor       ymm5, ymm5, ymm5

    // sum rows
  xloop:
    vmovdqu     ymm3, [eax]  // read 32 bytes
    lea         eax, [eax + 32]
    vpermq      ymm3, ymm3, 0xd8  // unmutate for vpunpck
    vpunpcklbw  ymm2, ymm3, ymm5
    vpunpckhbw  ymm3, ymm3, ymm5
    vpaddusw    ymm0, ymm2, [edx]  // sum 16 words
    vpaddusw    ymm1, ymm3, [edx + 32]
    vmovdqu     [edx], ymm0  // write 32 words to destination
    vmovdqu     [edx + 32], ymm1
    lea         edx, [edx + 64]
    sub         ecx, 32
    jg          xloop

    vzeroupper
    ret
  }
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};
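
// Why the bias works: pmaddubsw multiplies unsigned bytes by signed bytes
// with signed saturation. Subtracting 0x80 from each pixel keeps the dot
// product f0 * (p0 - 128) + f1 * (p1 - 128) in signed 16 bit range. The
// two 7 bit fractions sum to 128, so that result is the true weighted sum
// minus 128 * 128 = 16384. kFadd40 (0x4040 = 16384 + 64) restores the bias
// and adds the +64 that makes the final >> 7 round to nearest.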

// Bilinear column filtering. SSSE3 version.
__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                                             const uint8* src_ptr,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]  // dst_ptr
    mov        esi, [esp + 12 + 8]  // src_ptr
    mov        ecx, [esp + 12 + 12]  // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000  // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pcmpeqb    xmm7, xmm7  // generate 0x0001
    psrlw      xmm7, 15
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9  // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5  // 0011
    punpcklwd  xmm0, xmm4
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    paddusb    xmm1, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm1, xmm0  // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    paddw      xmm1, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm1, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm1, xmm1  // 8 bits, 2 pixels.
    movd       ebx, xmm1
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

 xloop29:
    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9  // 7 bit fractions.
    pshufb     xmm2, xmm5  // 0011
    psubb      xmm0, xmmword ptr kFsub80  // make pixels signed.
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    paddusb    xmm2, xmm7  // +1 so 0..7f and 80..1
    pmaddubsw  xmm2, xmm0  // 16 bit
    paddw      xmm2, xmmword ptr kFadd40  // make pixels unsigned and round.
    psrlw      xmm2, 7  // 8.7 fixed point to low 8 bits.
    packuswb   xmm2, xmm2  // 8 bits
    movd       ebx, xmm2
    mov        [edi], bl

 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
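
// x and dx are 16.16 fixed point: the integer pixel index lives in the top
// 16 bits (pextrw ..., 1 reads it) and the 7 bit blend fraction comes from
// the psrlw by 9. One output pixel in scalar form (a sketch, not compiled):
//
//   int xi = x >> 16;
//   int f = (x >> 9) & 0x7f;
//   dst_ptr[i] =
//       (uint8)((src_ptr[xi] * (128 - f) + src_ptr[xi + 1] * f + 64) >> 7);
//   x += dx;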

// Reads 16 pixels, duplicates them and writes 32 pixels.
__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr,
                                         const uint8* src_ptr,
                                         int dst_width,
                                         int x,
                                         int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_ptr
    mov        eax, [esp + 8]  // src_ptr
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 32
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7);
// shufps 0xdd selects the odd dwords.
__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                                              ptrdiff_t src_stride,
                                              uint8* dst_argb,
                                              int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    mov        eax, [esp + 4]  // src_argb
    // src_stride ignored
    mov        edx, [esp + 12]  // dst_argb
    mov        ecx, [esp + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]  // src_argb
    mov        esi, [esp + 4 + 8]  // src_stride
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                                                 ptrdiff_t src_stride,
                                                 int src_stepx,
                                                 uint8* dst_argb,
                                                 int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]  // src_argb
    // src_stride ignored
    mov        ebx, [esp + 8 + 12]  // src_stepx
    mov        edx, [esp + 8 + 16]  // dst_argb
    mov        ecx, [esp + 8 + 20]  // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}
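
// Scalar equivalent: copy every src_stepx-th ARGB pixel (a sketch, not
// compiled here):
//
//   const uint32* src = (const uint32*)src_argb;
//   uint32* dst = (uint32*)dst_argb;
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = src[i * src_stepx];
//   }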

// Blends four 2x2 to 4x1.
__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                                    ptrdiff_t src_stride,
                                                    int src_stepx,
                                                    uint8* dst_argb,
                                                    int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]  // src_argb
    mov        esi, [esp + 12 + 8]  // src_stride
    mov        ebx, [esp + 12 + 12]  // src_stepx
    mov        edx, [esp + 12 + 16]  // dst_argb
    mov        ecx, [esp + 12 + 20]  // dst_width
    lea        esi, [eax + esi]  // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2  // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0  // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88  // even pixels
    shufps     xmm2, xmm1, 0xdd  // odd pixels
    pavgb      xmm0, xmm2
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    sub        ecx, 4
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb,
                                          const uint8* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0  // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11  // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05  // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0  // x3 x2 x1 x0
    paddd      xmm3, xmm3  // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0  // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1  // get x0 integer.
    pextrw     edx, xmm2, 3  // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    pextrw     edx, xmm2, 7  // get x3 integer.
    paddd      xmm2, xmm3  // x += dx
    punpckldq  xmm0, xmm1  // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4  // x2 x3
    punpcklqdq xmm0, xmm1  // x0 x1 x2 x3
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    sub        ecx, 4  // 4 pixels
    jge        xloop4

 xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5  // get x2 integer.
    punpckldq  xmm0, xmm1  // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

    // 1 Pixel.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}
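
// Scalar form of the unfiltered column scaler (a sketch, not compiled):
//
//   const uint32* src = (const uint32*)src_argb;
//   uint32* dst = (uint32*)dst_argb;
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = src[x >> 16];  // 16.16 fixed point source index
//     x += dx;
//   }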

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
    0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
                                                 const uint8* src_argb,
                                                 int dst_width,
                                                 int x,
                                                 int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]  // dst_argb
    mov        esi, [esp + 8 + 8]  // src_argb
    mov        ecx, [esp + 8 + 12]  // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, xmmword ptr kShuffleColARGB
    movdqa     xmm5, xmmword ptr kShuffleFractions
    pcmpeqb    xmm6, xmm6  // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1  // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2  // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0  // x0 x1
    punpckldq  xmm3, xmm3  // dx dx
    paddd      xmm3, xmm3  // dx * 2, dx * 2
    pextrw     edx, xmm2, 3  // get x1 integer. preroll

    // 2 Pixel loop.
  xloop2:
    movdqa     xmm1, xmm2  // x0, x1 fractions.
    paddd      xmm2, xmm3  // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9  // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5  // 0000000011111111
    pshufb     xmm0, xmm4  // arrange pixels into pairs
    pxor       xmm1, xmm6  // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1  // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1  // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3  // get x1 integer. next iteration.
    psrlw      xmm0, 7  // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0  // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2  // 2 pixels
    jge        xloop2

 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9  // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5  // 00000000
    pshufb     xmm0, xmm4  // arrange pixels into pairs
    pxor       xmm2, xmm6  // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2  // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0  // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

 xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                                             const uint8* src_argb,
                                             int dst_width,
                                             int x,
                                             int dx) {
  __asm {
    mov        edx, [esp + 4]  // dst_argb
    mov        eax, [esp + 8]  // src_argb
    mov        ecx, [esp + 12]  // dst_width

  wloop:
    movdqu     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    movdqu     [edx], xmm0
    movdqu     [edx + 16], xmm1
    lea        edx, [edx + 32]
    sub        ecx, 8
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}
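
// C equivalent: (int)(((int64)num << 16) / div). For example,
// FixedDiv_X86(1, 3) returns 65536 / 3 = 21845 (0x5555), i.e. about one
// third in 16.16 fixed point.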

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]  // num
    mov        ecx, [esp + 8]  // denom
    cdq  // extend num to 64 bits
    shld       edx, eax, 16  // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
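
// C equivalent of FixedDiv1_X86, read off the asm above (a sketch,
// assuming 64 bit intermediates):
//
//   int FixedDiv1(int num, int div) {
//     return (int)((((int64)num << 16) - 0x00010001) / (div - 1));
//   }
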
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif