1; Copyright (c) 2011 The Chromium Authors. All rights reserved.
2; Use of this source code is governed by a BSD-style license that can be
3; found in the LICENSE file.
4
5%include "media/base/simd/media_export.asm"
6%include "third_party/x86inc/x86inc.asm"
7
8;
; This file uses MMX and SSE2 instructions.
10;
11  SECTION_TEXT
12  CPU       SSE2
13
; void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
;                                  const uint8* u_buf,
;                                  const uint8* v_buf,
;                                  uint8* rgb_buf,
;                                  ptrdiff_t width,
;                                  ptrdiff_t source_dx,
;                                  const int16* convert_table);
;
; NOTE(review): the element type of convert_table is inferred from the 16-bit
; SIMD arithmetic below; confirm it against the C declaration.
;
; Converts one row of planar YUV to 4-byte-per-pixel RGB while scaling
; horizontally. The source position x starts at 0 and advances by source_dx
; for every output pixel. Y is sampled at (x >> 16) and U/V at (x >> 17), so x
; carries 16 fractional bits and chroma is read at half the horizontal
; resolution of luma.
;
; convert_table is addressed as three consecutive runs of 8-byte entries, each
; indexed by a sample byte:
;   Y contribution: [table +    0 + 8 * y]
;   U contribution: [table + 2048 + 8 * u]
;   V contribution: [table + 4096 + 8 * v]
; The three entries are added with signed saturation, shifted right by 6 and
; packed to unsigned bytes with saturation.
;
; Pixels are produced two at a time (8 bytes stored per loop iteration); an
; odd trailing pixel is produced separately (4 bytes stored). Nothing is
; read or written when width <= 0.
%define SYMBOL ScaleYUVToRGB32Row_SSE2_X64
  EXPORT    SYMBOL
  align     function_align

mangle(SYMBOL):
  %assign   stack_offset 0
  ; Not referenced by the code below; the table actually used is the one
  ; passed in as the 7th argument.
  extern    mangle(kCoefficientsRgbY)

; Parameters are in the following order:
; 1. Y plane
; 2. U plane
; 3. V plane
; 4. ARGB frame
; 5. Width
; 6. Source dx
; 7. Convert table

PROLOGUE  7, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, R1

; Register roles:
;   TABLEq - base address of the conversion table (copied from argument 7).
;   Xq     - current source position, 16 fractional bits.
;   INDEXq - scratch: sample index / position of the following pixel.
;   COMPq  - scratch: zero-extended sample byte used as a table index. It
;            reuses the register argument 7 arrived in, so the table pointer
;            must be copied to TABLEq before COMP is first written.
%define     TABLEq   r10
%define     Xq       r11
%define     INDEXq   r12
%define     COMPq    R1q
%define     COMPd    R1d

  ; Preserve the extra registers used below; they are restored in reverse
  ; order at .scaledone.
  PUSH      r10
  PUSH      r11
  PUSH      r12

  mov       TABLEq, R1q

  ; Set Xq index to 0.
  xor       Xq, Xq
  jmp       .scaleend

.scaleloop:
  ; Read the U and V samples for this pair of pixels; both pixels share the
  ; chroma sampled at the first pixel's position.
  mov       INDEXq, Xq
  sar       INDEXq, 17
  movzx     COMPd, BYTE [Uq + INDEXq]
  movq      xmm0, [TABLEq + 2048 + 8 * COMPq]
  movzx     COMPd, BYTE [Vq + INDEXq]
  movq      xmm1, [TABLEq + 4096 + 8 * COMPq]

  ; Read first Y pixel.
  lea       INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to the next pixel.
  sar       Xq, 16
  movzx     COMPd, BYTE [Yq + Xq]
  paddsw    xmm0, xmm1                ; xmm0 = U + V terms; the add is
                                      ; scheduled behind the memory load.
  movq      xmm1, [TABLEq + 8 * COMPq]

  ; Read next Y pixel.
  lea       Xq, [INDEXq + SOURCE_DXq] ; Xq now points to the pixel after it.
  sar       INDEXq, 16
  movzx     COMPd, BYTE [Yq + INDEXq]
  movq      xmm2, [TABLEq + 8 * COMPq]
  paddsw    xmm1, xmm0                ; first pixel:  Y + (U + V)
  paddsw    xmm2, xmm0                ; second pixel: Y + (U + V)
  shufps    xmm1, xmm2, 0x44          ; Join two pixels into one XMM register
  psraw     xmm1, 6
  packuswb  xmm1, xmm1                ; saturate 16-bit sums to unsigned bytes
  movq      QWORD [ARGBq], xmm1
  add       ARGBq, 8

.scaleend:
  sub       WIDTHq, 2
  jns       .scaleloop

  ; WIDTHq now holds (pixels still to convert) - 2. A single trailing pixel
  ; remains only when that value is exactly -1. Comparing against -1 instead
  ; of testing bit 0 also keeps a negative odd width from reaching the
  ; single-pixel path and storing a stray pixel.
  cmp       WIDTHq, -1
  jne       .scaledone

  ; Read U V components.
  mov       INDEXq, Xq
  sar       INDEXq, 17
  movzx     COMPd, BYTE [Uq + INDEXq]
  movq      xmm0, [TABLEq + 2048 + 8 * COMPq]
  movzx     COMPd, BYTE [Vq + INDEXq]
  movq      xmm1, [TABLEq + 4096 + 8 * COMPq]
  paddsw    xmm0, xmm1

  ; Read one Y component.
  mov       INDEXq, Xq
  sar       INDEXq, 16
  movzx     COMPd, BYTE [Yq + INDEXq]
  movq      xmm1, [TABLEq + 8 * COMPq]
  paddsw    xmm1, xmm0
  psraw     xmm1, 6
  packuswb  xmm1, xmm1
  movd      DWORD [ARGBq], xmm1       ; store the single remaining pixel

.scaledone:
  POP       r12
  POP       r11
  POP       r10
  RET
115