1/*
2 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
18// This module is for Visual C x86.
19#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
20
// Shuffle/weight tables used by the SSSE3 down-scalers below.
// A shuffle index of 128 zeroes the destination byte (pshufb sign-bit rule).

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters: added before the >> 2.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

// Gather every ~2.67th byte from the low half into output bytes 0..5.
static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Same gather, placed into output bytes 6..11 for the second 16 pixels.
static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
94
// Reads 32 pixels, throws half away and writes 16 pixels.
// Point-samples by keeping the odd-indexed source bytes (psrlw 8 + packuswb).
// Processes 16 output pixels per iteration; dst_width is assumed to be a
// multiple of 16 by the caller.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}
122
// Blends 32x1 rectangle to 16x1.
// Averages each even/odd byte pair horizontally with pavgw.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8               // odd bytes into word lanes
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5            // even bytes into word lanes
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2            // (odd + even + 1) >> 1
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}
160
// Blends 32x2 rectangle to 16x1.
// Averages the two source rows with pavgb, then averages horizontal pairs.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]     // second row via src_stride
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}
204
// Reads 32 pixels, throws half away and writes 16 pixels.
// Unaligned variant of ScaleRowDown2_SSE2: uses movdqu throughout, so
// neither src_ptr nor dst_ptr needs 16 byte alignment.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}
233
// Blends 32x1 rectangle to 16x1.
// Unaligned variant of ScaleRowDown2Linear_SSE2: uses movdqu throughout,
// so neither src_ptr nor dst_ptr needs 16 byte alignment.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}
272
// Blends 32x2 rectangle to 16x1.
// Unaligned variant of ScaleRowDown2Box_SSE2: uses movdqu throughout,
// so neither src_ptr nor dst_ptr needs 16 byte alignment.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}
317
// Point samples 32 pixels to 8 pixels.
// Keeps byte 2 of every 4 (mask 0x00ff0000), then packs twice down to bytes.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    ret
  }
}
350
// Blends 32x4 rectangle to 8x1.
// Averages 4 rows pairwise with pavgb, then reduces columns 32->16->8.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]           // rows 0 and 1
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [eax + esi * 2] // rows 2 and 3
    movdqa     xmm3, [eax + esi * 2 + 16]
    movdqa     xmm4, [eax + edi]
    movdqa     xmm5, [eax + edi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2            // combine row pairs
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}
411
// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, kShuf0          // selects bytes 0..9 of first 16
    movdqa     xmm4, kShuf1          // selects bytes 11..20 (rebased by 8)
    movdqa     xmm5, kShuf2          // selects bytes 21..31 (rebased by 16)

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8         // xmm1 = source bytes 8..23
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}
450
// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.
// Rows are averaged 1:1 with pavgb, then each output pixel is a
// pmaddubsw-weighted (kMadd* 3:1 / 2:2 / 1:3) blend, rounded via kRound34.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm0, xmm1            // 1:1 vertical average
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7            // round
    psrlw      xmm0, 2               // / 4 (weights sum to 4)
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]      // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21         // no free register; load per iteration
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    jg         wloop

    pop        esi
    ret
  }
}
523
// Blends 32x2 rectangle to 24x1, like ScaleRowDown34_1_Box_SSSE3 but the
// rows are blended ~3:1 (row0 weighted heavier) via the double pavgb:
// xmm1 = avg(r1, r0); xmm0 = avg(r0, xmm1) ~= (3*r0 + r1) / 4.
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm1, xmm0            // ~3:1 vertical blend
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]      // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21         // no free register; load per iteration
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx+24]
    jg         wloop

    pop        esi
    ret
  }
}
585
// 3/8 point sampler

// Scale 32 pixels to 12.
// Each movdqa load of 16 pixels is shuffled down to 6 output bytes;
// the two shuffles target disjoint byte positions, so paddusb merges them.
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

    align      4
  xloop:
    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1            // merge; non-overlapping lanes

    sub        ecx, 12
    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    jg         xloop

    ret
  }
}
619
// Scale 16x3 pixels to 6x1 with interpolation.
// Sums 3 rows as 16-bit words, folds groups of 3 (or 2 at the edge) columns,
// then divides by the box area via pmulhuw with kScaleAc33 (1/9, 1/6).
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5            // zero for byte->word unpack

    align      4
  xloop:
    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqa     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqa     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2               // shift-and-add folds 3 columns
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    sub        ecx, 6
    movd       [edx], xmm6           // write 6 pixels (overlapping stores)
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}
685
// Scale 16x2 pixels to 6x1 with interpolation.
// Averages 2 rows with pavgb, gathers each output's 3 (or 2) source bytes
// via the kShufAb* tables, then scales by kScaleAb2 (1/3, 1/2) via pmulhuw.
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

    align      4
  xloop:
    movdqa     xmm0, [eax]           // average 2 rows into xmm0
    pavgb      xmm0, [eax + esi]
    lea        eax, [eax + 16]

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    sub        ecx, 6
    movd       [edx], xmm1           // write 6 pixels (overlapping stores)
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}
730
// Reads 16xN bytes and produces 16 shorts at a time.
// Sums src_height rows of 16 bytes into 16 uint16 accumulators per column.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        esi, [esp + 16 + 4]   // src_ptr
    mov        edx, [esp + 16 + 8]   // src_stride
    mov        edi, [esp + 16 + 12]  // dst_ptr
    mov        ecx, [esp + 16 + 16]  // src_width
    mov        ebx, [esp + 16 + 20]  // src_height
    pxor       xmm4, xmm4            // zero for byte->word unpack
    dec        ebx                   // first row handled outside yloop

    align      4
  xloop:
    // first row
    movdqa     xmm0, [esi]
    lea        eax, [esi + edx]      // eax walks down the column
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    lea        esi, [esi + 16]
    mov        ebp, ebx
    test       ebp, ebp
    je         ydone                 // height == 1: nothing more to sum

    // sum remaining rows
    align      4
  yloop:
    movdqa     xmm2, [eax]       // read 16 pixels
    lea        eax, [eax + edx]  // advance to next row
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    paddusw    xmm0, xmm2        // sum 16 words
    paddusw    xmm1, xmm3
    sub        ebp, 1
    jg         yloop

    align      4
  ydone:
    movdqa     [edi], xmm0
    movdqa     [edi + 16], xmm1
    lea        edi, [edi + 32]

    sub        ecx, 16
    jg         xloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
792
// Bilinear column filtering. SSSE3 version.
// x and dx are 16.16 fixed point source positions/steps; each output pixel
// blends 2 adjacent source pixels by the 7 bit fraction via pmaddubsw.
// TODO(fbarchard): Port to Neon
// NOTE(review): an older TODO here suggested replacing
//    xor ebx, ebx / mov bx, word ptr [esi + eax]
// with movzx once a drmemory false positive was fixed
// (https://code.google.com/p/drmemory/issues/detail?id=1396);
// this version already uses movzx.
__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29              // fewer than 2 pixels remain

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9              // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5           // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

    align      4
 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9              // 7 bit fractions.
    pshufb     xmm2, xmm5           // 0011
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // 16 bit
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl

    align      4
 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
881
// Reads 16 pixels, duplicates them and writes 32 pixels (2x horizontal
// upscale via punpcklbw/punpckhbw self-interleave). x and dx are unused.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0        // duplicate each byte
    punpckhbw  xmm1, xmm1
    sub        ecx, 32
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}
908
// Reads 8 ARGB pixels, throws half away and writes 4 pixels.
// shufps 0xdd keeps the odd-indexed pixels (1, 3, 5, 7) of each 8.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd      // odd pixels
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}
935
// Blends 8x1 ARGB rectangle to 4x1.
// Separates even/odd pixels with shufps then averages them with pavgb.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}
965
// Blends 8x2 ARGB rectangle to 4x1.
// Averages the two rows with pavgb, then averages even/odd pixel pairs.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}
1001
// Reads 4 pixels at a time, stepping src_stepx pixels between each, and
// packs them into one aligned 16-byte store. Point sampling; src_stride
// is accepted but unused.
// ebx = src_stepx in bytes (stepx * 4), edi = 3 * step for the 4th pixel.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]        // step in bytes (4 bytes per ARGB pixel)
    lea        edi, [ebx + ebx * 2]  // 3 * step, for the 4th pixel's offset

    align      4
  wloop:
    movd       xmm0, [eax]           // gather 4 pixels at 0, 1, 2, 3 steps
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2            // combine into 4 contiguous pixels
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}
1039
// Blends four 2x2 pixel boxes to 4x1: reads pixel pairs from two rows at
// src_stepx intervals, averages the rows, then averages each horizontal
// pair. esi = row1 pointer, ebx = step in bytes, edi = 3 * step.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]         // step in bytes (4 bytes per pixel)
    lea        edi, [ebx + ebx * 2]   // 3 * step

    align      4
  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
1089
// Column scaling unfiltered. SSE2 version.
// Point-samples dst_width ARGB pixels: for each output pixel i, copies
// the source pixel at integer part of (x + i * dx), where x and dx are
// 16.16 fixed point. xmm2 holds the four current positions x0..x3 and
// xmm3 the per-iteration step dx * 4 in every lane; pextrw of odd words
// extracts each lane's integer (upper 16) bits for addressing.
// Handles 4 pixels per loop with 2- and 1-pixel remainders. dst is
// written with movdqu, so no dst alignment is required.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    // Build lane offsets x, x+dx, x+2*dx, x+3*dx and the step dx * 4.
    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99               // nothing to do for dst_width <= 0
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
    align      4
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    sub        ecx, 4                 // 4 pixels
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    jge        xloop4

    align      4
 xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer for the 1 pixel tail.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

    // 1 Pixels.
    movd       xmm0, [esi + eax * 4]  // last source pixel (eax holds its x)
    movd       dword ptr [edi], xmm0
    align      4
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}
1170
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw: each
// destination pixel's two source pixels are interleaved channel by
// channel so pmaddubsw can blend them in one multiply-add.
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each (bytes 0
// and 4 of the fraction register are broadcast across each half).
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
1184
// Bilinear column filtering, 2 output pixels per loop. For each output
// pixel, blends source pixels [xi] and [xi + 1] using the low 7 bits of
// the 16.16 fraction: result = src[xi] * (128 - f) + src[xi + 1] * f,
// via pshufb (kShuffleColARGB / kShuffleFractions) and pmaddubsw, then
// >> 7 and pack back to 8 bits. xmm2 carries positions x0, x1; xmm3 the
// step dx * 2; xmm6 the 0x007f mask used to invert the fraction.
__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29              // fewer than 2 pixels: skip to remainder

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9              // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5           // 0000000011111111
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

    align      4
 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9              // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5           // 00000000
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

    align      4
 xloop99:

    pop        edi
    pop        esi
    ret
  }
}
1257
// Reads 4 pixels, duplicates them and writes 8 pixels (2x horizontal
// upscale by pixel doubling via punpckldq/punpckhdq). The x and dx
// arguments are accepted for signature compatibility but unused.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]       // 4 source pixels
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0        // duplicate low 2 pixels
    punpckhdq  xmm1, xmm1        // duplicate high 2 pixels
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}
1284
// Divide num by div and return as 16.16 fixed point result.
// Builds the 64-bit value (num << 16) in edx:eax via cdq + shld/shl,
// then idiv by the second argument. Result truncates toward zero and,
// like idiv, the quotient must fit in 32 bits (and div must be nonzero)
// or the CPU faults.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}
1297
// Divide num by div and return as 16.16 fixed point result.
// Variant computing ((num << 16) - 0x00010001) / (div - 1): the 64-bit
// subtraction is done with sub/sbb across eax:edx before the idiv.
// Same idiv constraints as FixedDiv_X86 apply (div must not be 1 here,
// since the divisor used is div - 1).
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // denom
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
1314
1315#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
1316
1317#ifdef __cplusplus
1318}  // extern "C"
1319}  // namespace libyuv
1320#endif
1321