; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "media/base/simd/media_export.asm"

; Byte offsets of the per-channel sub-tables inside TABLE. Each sub-table is
; indexed by an 8-bit sample value and every entry is 8 bytes wide, so
; consecutive sub-tables are 256 * 8 = 2048 bytes apart. The Y sub-table sits
; at offset 0.
%define YUVA_TABLE_U_OFFSET 2048
%define YUVA_TABLE_V_OFFSET 4096
%define YUVA_TABLE_A_OFFSET 6144

; Once shifted into the upper dword of an MMX register, this adds 1 to the top
; 16-bit lane (the alpha lane) of an unpacked pixel and 0 to the other lanes.
%define ALPHA_LANE_BIAS 0x00010000

;------------------------------------------------------------------------------
; mangle(SYMBOL)(Y, U, V, A, ARGB, WIDTH, TABLE)
;
; Converts one row of YUVA samples into packed 4-byte pixels using MMX and a
; lookup table, multiplying every channel by the pixel's alpha entry.
;
; In:
;   Y, A   - source rows, one byte consumed per pixel.
;   U, V   - source rows, one byte consumed per pair of pixels.
;   ARGB   - destination row, four bytes written per pixel.
;   WIDTH  - pixel count, handled as a 32-bit value. The loop tests the sign
;            flag, so the count is expected to be non-negative.
;   TABLE  - lookup table: 8-byte entries (four signed 16-bit lanes) for Y at
;            offset 0 and for U, V and A at the offsets defined above.
;
; Pixels are converted two at a time; a trailing odd pixel is converted on its
; own and written with a 4-byte store.
;
; Uses mm0-mm2 and does not execute emms.
; NOTE(review): presumably the caller is responsible for clearing the MMX
; state afterwards -- confirm.
;
; SYMBOL, EXPORT, mangle, function_align, PROLOGUE, DEFINE_ARGS, PUSH, POP,
; RET and MOVQ are not defined in this file; they have to come from
; media_export.asm or from the file that includes this one.
;------------------------------------------------------------------------------
  EXPORT    SYMBOL
  align     function_align

mangle(SYMBOL):
  %assign   stack_offset 0
  PROLOGUE  7, 7, 3, Y, U, V, A, ARGB, WIDTH, TABLE

  ; Keep the pixel count on the stack so that its register can be recycled.
  ; After DEFINE_ARGS the register formerly named WIDTH is called TABLE and
  ; the register formerly named TABLE is called TEMP, so the mov below copies
  ; the table pointer into its new home and leaves TEMP free as scratch.
  PUSH      WIDTHq
  DEFINE_ARGS Y, U, V, A, ARGB, TABLE, TEMP
  mov       TABLEq, TEMPq
  jmp       .convertend

  ; Main loop: two pixels per iteration.
  ; [rsp] holds the number of pixels still to convert.
.convertloop:
  ; mm0 = U entry + V entry. One U/V pair is shared by both pixels.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [TABLEq + YUVA_TABLE_U_OFFSET + 8 * TEMPq]
  add       Uq, 1

  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [TABLEq + YUVA_TABLE_V_OFFSET + 8 * TEMPq]
  add       Vq, 1

  ; mm1 / mm2 = Y entries of the first / second pixel.
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [TABLEq + 8 * TEMPq]

  movzx     TEMPd, BYTE [Yq + 1]
  movq      mm2, [TABLEq + 8 * TEMPq]
  add       Yq, 2

  ; Add the shared chroma contribution to each luma entry (saturating).
  paddsw    mm1, mm0
  paddsw    mm2, mm0

  ; Drop the 6 low bits, then pack both pixels into 8 unsigned bytes; the
  ; unsigned saturation clamps every lane to 0..255.
  psraw     mm1, 6
  psraw     mm2, 6
  packuswb  mm1, mm2

  ; Widen the bytes back to 16-bit lanes: mm0 = first pixel, mm1 = second.
  movq      mm0, mm1
  pxor      mm2, mm2
  punpcklbw mm0, mm2
  punpckhbw mm1, mm2

  ; Add one to the alpha lane of both pixels. The pack/unpack above clamps
  ; negative values to 0 but also caps the alpha lane at 255, whereas the
  ; multiply below needs 256 there to reproduce the source alpha A exactly:
  ; (256 * A) >> 8 == A, while (255 * A) >> 8 is off by one for every A != 0.
  mov       TEMPq, ALPHA_LANE_BIAS
  movd      mm2, TEMPd
  psllq     mm2, 32
  paddsw    mm0, mm2
  paddsw    mm1, mm2

  ; Multiply each pixel by its alpha-table entry and keep the high byte of
  ; every 16-bit product, then repack both pixels into 8 bytes.
  ; NOTE(review): the alpha entry presumably holds the alpha value in every
  ; lane -- confirm against the code that builds the table.
  movzx     TEMPd, BYTE [Aq]
  movq      mm2, [TABLEq + YUVA_TABLE_A_OFFSET + 8 * TEMPq]
  pmullw    mm0, mm2
  psrlw     mm0, 8
  movzx     TEMPd, BYTE [Aq + 1]
  movq      mm2, [TABLEq + YUVA_TABLE_A_OFFSET + 8 * TEMPq]
  add       Aq, 2
  pmullw    mm1, mm2
  psrlw     mm1, 8
  packuswb  mm0, mm1

  ; Store two finished pixels.
  MOVQ      [ARGBq], mm0
  add       ARGBq, 8

.convertend:
  ; Loop while at least two pixels were left before this subtraction.
  sub       dword [rsp], 2
  jns       .convertloop

  ; The counter is now negative. Its lowest bit is set exactly when one pixel
  ; is still pending (the original count was odd).
  and       dword [rsp], 1
  jz        .convertdone

  ; Tail: convert the single remaining pixel. The source pointers are not
  ; advanced because nothing is read after this.
  movzx     TEMPd, BYTE [Uq]
  movq      mm0, [TABLEq + YUVA_TABLE_U_OFFSET + 8 * TEMPq]
  movzx     TEMPd, BYTE [Vq]
  paddsw    mm0, [TABLEq + YUVA_TABLE_V_OFFSET + 8 * TEMPq]
  movzx     TEMPd, BYTE [Yq]
  movq      mm1, [TABLEq + 8 * TEMPq]
  paddsw    mm1, mm0
  psraw     mm1, 6
  packuswb  mm1, mm1

  ; Widen to 16-bit lanes ahead of the alpha multiply.
  pxor      mm0, mm0
  punpcklbw mm1, mm0

  ; Same alpha-lane correction as in the main loop: 255 -> 256.
  mov       TEMPq, ALPHA_LANE_BIAS
  movd      mm0, TEMPd
  psllq     mm0, 32
  paddsw    mm1, mm0

  ; Multiply by the alpha-table entry, keep the high bytes and repack.
  movzx     TEMPd, BYTE [Aq]
  movq      mm0, [TABLEq + YUVA_TABLE_A_OFFSET + 8 * TEMPq]
  pmullw    mm1, mm0
  psrlw     mm1, 8
  packuswb  mm1, mm1

  ; Store only the low four bytes: one pixel.
  movd      [ARGBq], mm1

.convertdone:
  ; Discard the saved pixel count to rebalance the stack; TABLE is dead here.
  POP       TABLEq
  RET