; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "media/base/simd/media_export.asm"
%include "third_party/x86inc/x86inc.asm"

;
; This file uses SSE and SSE2 instructions (xmm registers only).
;
  SECTION_TEXT
  CPU       SSE2

; void ScaleYUVToRGB32Row_SSE2_X64(const uint8* y_buf,
;                                  const uint8* u_buf,
;                                  const uint8* v_buf,
;                                  uint8* rgb_buf,
;                                  ptrdiff_t width,
;                                  ptrdiff_t source_dx,
;                                  const void* convert_table);
;
; NOTE(review): the code reads a 7th argument (the conversion table) that the
; original prototype comment omitted. Its exact C type is not visible in this
; file -- confirm against the C declaration.
;
; Converts one row of planar Y/U/V samples into 32-bit pixels while scaling
; horizontally. The source position X is a fixed-point value that starts at 0
; and advances by source_dx per output pixel; the Y sample is taken at
; X >> 16 and the U/V samples at X >> 17 (half the Y index). Each sample byte
; selects an 8-byte table entry (four signed 16-bit words); the Y, U and V
; entries are added with signed saturation, shifted right arithmetically and
; packed to unsigned bytes.
;
; Pixels are produced two at a time (8 bytes stored per iteration, both
; pixels sharing the U/V entry sampled at the first pixel's position); a
; trailing odd pixel is handled separately with a 4-byte store.
;
; Register roles inside the function:
;   Yq, Uq, Vq   source plane pointers (arguments 1-3)
;   ARGBq        output pointer, advanced as pixels are written (argument 4)
;   WIDTHq       pixels remaining (argument 5)
;   SOURCE_DXq   fixed-point step per output pixel (argument 6)
;   R1q          argument 7 (table pointer) on entry; reused as COMPq/COMPd,
;                the scratch register holding the current sample byte
;   TABLEq (r10) copy of the table pointer
;   Xq     (r11) fixed-point source position
;   INDEXq (r12) scratch sample index / lookahead position
;   xmm0-xmm2    scratch
%define SYMBOL ScaleYUVToRGB32Row_SSE2_X64
  EXPORT    SYMBOL
  align     function_align

mangle(SYMBOL):
  %assign   stack_offset 0
  ; NOTE(review): this symbol is not referenced anywhere below (the table is
  ; taken from argument 7 instead); the declaration is kept unchanged.
  extern    mangle(kCoefficientsRgbY)

; Parameters are in the following order:
; 1. Y plane
; 2. U plane
; 3. V plane
; 4. ARGB frame
; 5. Width
; 6. Source dx
; 7. Convert table

PROLOGUE  7, 7, 3, Y, U, V, ARGB, WIDTH, SOURCE_DX, R1

; NOTE(review): r10-r12 are hard-coded rather than allocated through
; PROLOGUE. This assumes none of them is one of the registers x86inc assigned
; to the seven arguments -- confirm for every target ABI this file is built
; for (in particular the Win64 register mapping).
%define     TABLEq   r10
%define     Xq       r11
%define     INDEXq   r12
%define     COMPq    R1q
%define     COMPd    R1d

; Layout of the conversion table as used below: a sample byte (0..255) is
; scaled by 8, so each 256-entry sub-table spans 2048 bytes. The Y entries
; are read from offset 0, the U entries from 2048 and the V entries from 4096.
%define     U_TABLE_OFFSET   2048
%define     V_TABLE_OFFSET   4096

; Number of fractional bits in Xq / SOURCE_DXq (Y index = Xq >> X_FRAC_BITS).
%define     X_FRAC_BITS      16

; Arithmetic right shift applied to the summed table words before packing
; them to bytes (presumably the fixed-point precision of the table entries
; -- confirm against the table generator).
%define     COEFF_SHIFT      6

  ; r12 is callee-saved in the x86-64 ABIs and must be preserved. r10/r11 are
  ; caller-saved; they are still saved here to keep the original behavior.
  PUSH      r10
  PUSH      r11
  PUSH      r12

  ; Take a copy of the table pointer before R1 is reused as COMP.
  mov       TABLEq, R1q

  ; Set Xq index to 0.
  xor       Xq, Xq
  jmp       .scaleend

.scaleloop:
  ; Read UV pixels (shared by both output pixels of this iteration).
  mov       INDEXq, Xq
  sar       INDEXq, X_FRAC_BITS + 1
  movzx     COMPd, BYTE [Uq + INDEXq]
  movq      xmm0, [TABLEq + U_TABLE_OFFSET + 8 * COMPq]
  movzx     COMPd, BYTE [Vq + INDEXq]
  movq      xmm1, [TABLEq + V_TABLE_OFFSET + 8 * COMPq]

  ; Read first Y pixel.
  lea       INDEXq, [Xq + SOURCE_DXq] ; INDEXq now points to the next pixel.
  sar       Xq, X_FRAC_BITS           ; Xq becomes a plain index; the position
                                      ; is rebuilt from INDEXq below.
  movzx     COMPd, BYTE [Yq + Xq]
  paddsw    xmm0, xmm1                ; xmm0 = U + V; hides the add behind
                                      ; the memory load.
  movq      xmm1, [TABLEq + 8 * COMPq]

  ; Read next Y pixel.
  lea       Xq, [INDEXq + SOURCE_DXq] ; Xq now points to the pixel after next.
  sar       INDEXq, X_FRAC_BITS
  movzx     COMPd, BYTE [Yq + INDEXq]
  movq      xmm2, [TABLEq + 8 * COMPq]
  paddsw    xmm1, xmm0                ; first pixel  = Y0 + (U + V)
  paddsw    xmm2, xmm0                ; second pixel = Y1 + (U + V)
  shufps    xmm1, xmm2, 0x44          ; Join two pixels into one XMM register:
                                      ; low qword of xmm1, then low qword of
                                      ; xmm2.
  psraw     xmm1, COEFF_SHIFT
  packuswb  xmm1, xmm1                ; 8 words -> 8 bytes, unsigned saturate.
  movq      QWORD [ARGBq], xmm1       ; Store two 4-byte pixels.
  add       ARGBq, 8

.scaleend:
  ; Continue while at least two pixels remain.
  sub       WIDTHq, 2
  jns       .scaleloop

  ; WIDTHq is now -2 (even width) or -1 (odd width); its low bit tells
  ; whether one pixel is left over.
  and       WIDTHq, 1                 ; odd number of pixels?
  jz        .scaledone

  ; Read U V components.
  mov       INDEXq, Xq
  sar       INDEXq, X_FRAC_BITS + 1
  movzx     COMPd, BYTE [Uq + INDEXq]
  movq      xmm0, [TABLEq + U_TABLE_OFFSET + 8 * COMPq]
  movzx     COMPd, BYTE [Vq + INDEXq]
  movq      xmm1, [TABLEq + V_TABLE_OFFSET + 8 * COMPq]
  paddsw    xmm0, xmm1

  ; Read one Y component.
  mov       INDEXq, Xq
  sar       INDEXq, X_FRAC_BITS
  movzx     COMPd, BYTE [Yq + INDEXq]
  movq      xmm1, [TABLEq + 8 * COMPq]
  paddsw    xmm1, xmm0
  psraw     xmm1, COEFF_SHIFT
  packuswb  xmm1, xmm1
  movd      DWORD [ARGBq], xmm1       ; Store the single remaining pixel.

.scaledone:
  ; Restore in reverse order of the PUSHes above.
  POP       r12
  POP       r11
  POP       r10
  RET