; quantize_sse2.asm revision 538f6170b788de7408b06efc6613dc98579aa6a6
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;------------------------------------------------------------------------------
; int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
;                short *qcoeff_ptr, short *dequant_ptr,
;                const int *default_zig_zag, short *round_ptr,
;                short *quant_ptr, short *dqcoeff_ptr,
;                unsigned short zbin_oq_value,
;                short *zbin_boost_ptr);
;
; Quantizes 16 coefficients with a zero-bin test and returns eob + 1, where
; eob is the highest zig-zag index whose quantized value is non-zero
; (the return value is 0 when every coefficient quantizes to zero).
;
; Vector pass (all 16 lanes, 16-bit wrap-around arithmetic):
;   sz                 = z >> 15                      ; 0 or -1 per lane
;   x                  = (z ^ sz) - sz                ; abs(z)
;   abs_minus_zbin[k]  = x - (zbin_ptr[k] + zbin_oq_value)
;   y                  = high16((x + round_ptr[k]) * quant_ptr[k])  ; signed
;   temp_qcoeff[k]     = (y ^ sz) - sz                ; sign restored
;
; Scalar pass (i = 0..15, rc = default_zig_zag[i], unrolled four times):
;   boost = *zbin_boost_ptr++;
;   if (abs_minus_zbin[rc] >= boost && temp_qcoeff[rc] != 0) {
;       qcoeff_ptr[rc] = temp_qcoeff[rc];
;       zbin_boost_ptr = <start of boost table>;      ; reset after a non-zero
;       eob            = i;
;   }
;
; Final pass: dqcoeff_ptr[k] = low16(qcoeff_ptr[k] * dequant_ptr[k]).
;
; Requirements visible from the code:
;   * coeff_ptr, zbin_ptr, round_ptr, quant_ptr, qcoeff_ptr, dequant_ptr and
;     dqcoeff_ptr are each accessed as two 16-byte movdqa operations, so they
;     must be 16-byte aligned and cover 32 bytes.
;   * default_zig_zag is read as 16 dwords, each used as an index 0..15.
;   * zbin_boost_ptr is read as consecutive 16-bit values, up to 16 of them
;     when no coefficient survives.
;   * qcoeff_ptr is zeroed before the scalar pass; only surviving
;     coefficients are written afterwards.
;
; Loop register roles:
;   rax = zig-zag position i      rbx = default_zig_zag
;   rcx = rc                      rsi = running zbin_boost pointer
;   edi = boost value, then qcoeff_ptr      edx = scratch
;
; xmm6/xmm7 are spilled to the local frame and reloaded before returning
; (they are callee-saved under the Microsoft x64 convention).
; NOTE(review): arg(), SHADOW_ARGS_TO_STACK, ALIGN_STACK and UNSHADOW_ARGS come
; from x86_abi_support.asm; ALIGN_STACK is presumed to push the pre-alignment
; stack pointer that the later `pop rsp` restores - confirm against that file.
;------------------------------------------------------------------------------
global sym(vp8_regular_quantize_b_impl_sse2)
sym(vp8_regular_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 10
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax

    ; Local frame layout (byte offsets from the aligned rsp).
    %define abs_minus_zbin_lo 0             ; lanes 0-7  of x - zbin
    %define abs_minus_zbin_hi 16            ; lanes 8-15 of x - zbin
    %define temp_qcoeff_lo 32               ; lanes 0-7  of candidate qcoeff
    %define temp_qcoeff_hi 48               ; lanes 8-15 of candidate qcoeff
    %define save_xmm6 64
    %define save_xmm7 80
    %define eob 96

    %define vp8_regularquantizeb_stack_size eob + 16

    sub         rsp, vp8_regularquantizeb_stack_size

    movdqa      OWORD PTR[rsp + save_xmm6], xmm6
    movdqa      OWORD PTR[rsp + save_xmm7], xmm7

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         eax, arg(8)                 ;zbin_oq_value

    mov         rcx, arg(1)                 ;zbin_ptr
    movd        xmm7, eax

    movdqa      xmm0, OWORD PTR[rdx]
    movdqa      xmm4, OWORD PTR[rdx + 16]

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    movdqa      xmm2, OWORD PTR[rcx]        ;load zbin_ptr
    movdqa      xmm3, OWORD PTR[rcx + 16]   ;load zbin_ptr

    pshuflw     xmm7, xmm7, 0               ;word 0 copied to words 0-3
    psubw       xmm1, xmm0                  ;x = abs(z)

    punpcklwd   xmm7, xmm7                  ;duplicated zbin_oq_value (8 lanes)
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm2, xmm7                  ;zbin_ptr + zbin_oq_value
    paddw       xmm3, xmm7

    psubw       xmm1, xmm2                  ;sub (zbin_ptr + zbin_oq_value)
    psubw       xmm5, xmm3                  ;sub (zbin_ptr + zbin_oq_value)

    mov         rdi, arg(5)                 ;round_ptr
    mov         rsi, arg(6)                 ;quant_ptr

    ; Keep x - zbin for the per-coefficient boost comparison in the loop.
    movdqa      OWORD PTR[rsp + abs_minus_zbin_lo], xmm1
    movdqa      OWORD PTR[rsp + abs_minus_zbin_hi], xmm5

    paddw       xmm1, xmm2                  ;add (zbin_ptr + zbin_oq_value) back
    paddw       xmm5, xmm3                  ;add (zbin_ptr + zbin_oq_value) back

    movdqa      xmm2, OWORD PTR[rdi]        ;round lo
    movdqa      xmm3, OWORD PTR[rsi]        ;quant lo

    movdqa      xmm6, OWORD PTR[rdi + 16]   ;round hi
    movdqa      xmm7, OWORD PTR[rsi + 16]   ;quant hi

    paddw       xmm1, xmm2                  ;x + round
    paddw       xmm5, xmm6

    pmulhw      xmm1, xmm3                  ;y = high16((x + round) * quant)
    pmulhw      xmm5, xmm7

    mov         rsi, arg(2)                 ;qcoeff_ptr
    pxor        xmm6, xmm6                  ;all-zero vector for clearing qcoeff

    pxor        xmm1, xmm0                  ;(y ^ sz) ...
    pxor        xmm5, xmm4

    psubw       xmm1, xmm0                  ;... - sz : reapply sign of z
    psubw       xmm5, xmm4

    movdqa      OWORD PTR[rsp + temp_qcoeff_lo], xmm1
    movdqa      OWORD PTR[rsp + temp_qcoeff_hi], xmm5

    movdqa      OWORD PTR[rsi], xmm6        ;zero qcoeff
    movdqa      OWORD PTR[rsi + 16], xmm6   ;zero qcoeff

    xor         rax, rax                    ;i = 0
    mov         rcx, -1

    mov         [rsp + eob], rcx            ;eob = -1 (nothing kept yet)
    mov         rsi, arg(9)                 ;zbin_boost_ptr

    mov         rbx, arg(4)                 ;default_zig_zag

    ; Each pass through the loop handles positions i, i+1, i+2 and i+3.
rq_zigzag_loop:
    movsxd      rcx, DWORD PTR[rbx + rax*4] ;now we have rc
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1                 ;inside the zero bin: leave qcoeff 0

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx
    jz          rq_zigzag_1                 ;quantized to 0: no store, no reset

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr (restart boost run)
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4] ;rc for position i + 1
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]              ;i++ (lea leaves flags untouched)

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1a

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx
    jz          rq_zigzag_1a

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1a:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]              ;i++

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1b

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx
    jz          rq_zigzag_1b

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1b:
    movsxd      rcx, DWORD PTR[rbx + rax*4 + 4]
    movsx       edi, WORD PTR [rsi]         ;*zbin_boost_ptr aka zbin
    lea         rsi, [rsi + 2]              ;zbin_boost_ptr++

    movsx       edx, WORD PTR[rsp + abs_minus_zbin_lo + rcx *2]
    lea         rax, [rax + 1]              ;i++

    sub         edx, edi                    ;x - zbin
    jl          rq_zigzag_1c

    mov         rdi, arg(2)                 ;qcoeff_ptr

    movsx       edx, WORD PTR[rsp + temp_qcoeff_lo + rcx *2]

    test        edx, edx
    jz          rq_zigzag_1c

    mov         WORD PTR[rdi + rcx * 2], dx ;qcoeff_ptr[rc] = temp_qcoeff[rc]

    mov         rsi, arg(9)                 ;zbin_boost_ptr
    mov         [rsp + eob], rax            ;eob = i

rq_zigzag_1c:
    lea         rax, [rax + 1]              ;i++ (first position of next group)

    cmp         rax, 16
    jl          rq_zigzag_loop

    ; Dequantize: dqcoeff = qcoeff * dequant (low 16 bits of each product).
    mov         rdi, arg(2)                 ;qcoeff_ptr
    mov         rcx, arg(3)                 ;dequant_ptr
    mov         rsi, arg(7)                 ;dqcoeff_ptr

    movdqa      xmm2, OWORD PTR[rdi]
    movdqa      xmm3, OWORD PTR[rdi + 16]

    movdqa      xmm0, OWORD PTR[rcx]
    movdqa      xmm1, OWORD PTR[rcx + 16]

    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      OWORD PTR[rsi], xmm0        ;store dqcoeff
    movdqa      OWORD PTR[rsi + 16], xmm1   ;store dqcoeff

    mov         rax, [rsp + eob]

    movdqa      xmm6, OWORD PTR[rsp + save_xmm6]
    movdqa      xmm7, OWORD PTR[rsp + save_xmm7]

    add         rax, 1                      ;return eob + 1

    add         rsp, vp8_regularquantizeb_stack_size
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;------------------------------------------------------------------------------
; int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
;                           short *qcoeff_ptr, short *dequant_ptr,
;                           short *scan_mask, short *round_ptr,
;                           short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes and dequantizes 16 coefficients without a zero-bin test.
;
; Per 16-bit lane k (wrap-around arithmetic):
;   sz             = z >> 15                          ; 0 or -1
;   x              = (z ^ sz) - sz                    ; abs(z)
;   y              = high16((x + round_ptr[k]) * quant_ptr[k])   ; signed
;   qcoeff_ptr[k]  = (y ^ sz) - sz
;   dqcoeff_ptr[k] = low16(qcoeff_ptr[k] * dequant_ptr[k])
;
; Return value: every non-zero qcoeff lane contributes scan_mask[k] to a sum;
; with m = sum & 0xffff the routine returns 0 when m == 0, otherwise
; (index of the highest set bit of m) + 1.
; NOTE(review): this yields an end-of-block count only if scan_mask[k] holds
; 1 << (scan position of k) - presumed, confirm against the caller's table.
;
; Requirements visible from the code: every pointer argument is accessed as two
; 16-byte movdqa/pmulhw memory operands, so each must be 16-byte aligned and
; cover 32 bytes.
;
; xmm6/xmm7 are spilled to the local frame and reloaded before returning
; (they are callee-saved under the Microsoft x64 convention). rbx is pushed and
; popped although no instruction here writes it.
; NOTE(review): arg(), SHADOW_ARGS_TO_STACK, ALIGN_STACK and UNSHADOW_ARGS come
; from x86_abi_support.asm; ALIGN_STACK is presumed to push the pre-alignment
; stack pointer that the later `pop rsp` restores - confirm against that file.
;------------------------------------------------------------------------------
global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax

    ; Local frame layout (byte offsets from the aligned rsp).
    %define save_xmm6  0
    %define save_xmm7 16

    %define vp8_fastquantizeb_stack_size save_xmm7 + 16

    sub         rsp, vp8_fastquantizeb_stack_size

    movdqa      XMMWORD PTR[rsp + save_xmm6], xmm6
    movdqa      XMMWORD PTR[rsp + save_xmm7], xmm7

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
    mov         rax, arg(3)                 ;scan_mask
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    movdqa      xmm0, XMMWORD PTR[rdx]
    movdqa      xmm4, XMMWORD PTR[rdx + 16]

    movdqa      xmm6, XMMWORD PTR[rdi]      ;round lo
    movdqa      xmm7, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm1, xmm6                  ;x + round
    paddw       xmm5, xmm7

    pmulhw      xmm1, XMMWORD PTR[rsi]      ;y = high16((x + round) * quant)
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]

    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

    movdqa      xmm6, XMMWORD PTR[rcx]      ;dequant lo
    movdqa      xmm7, XMMWORD PTR[rcx + 16] ;dequant hi

    pxor        xmm1, xmm0                  ;(y ^ sz) ...
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;... - sz : reapply sign of z
    psubw       xmm5, xmm4

    movdqa      XMMWORD PTR[rdi], xmm1      ;store qcoeff
    movdqa      XMMWORD PTR[rdi + 16], xmm5 ;store qcoeff

    pmullw      xmm6, xmm1                  ;dqcoeff = qcoeff * dequant
    pmullw      xmm7, xmm5

    movdqa      xmm2, XMMWORD PTR[rax]      ;scan_mask lo
    movdqa      xmm3, XMMWORD PTR[rax+16]   ;scan_mask hi

    ; Turn qcoeff into a 0/1 flag per lane: 1 where the lane is non-zero.
    pxor        xmm4, xmm4            ;clear all bits
    pcmpeqw     xmm1, xmm4            ;0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4            ;set all bits
    pxor        xmm1, xmm4            ;invert: 0xffff where qcoeff != 0
    pxor        xmm5, xmm4

    psrlw       xmm1, 15              ;0xffff -> 1
    psrlw       xmm5, 15

    ; flag * scan_mask, summed pairwise into four dwords per register.
    pmaddwd     xmm1, xmm2
    pmaddwd     xmm5, xmm3

    ; Horizontal add of the eight dwords into the low dword of xmm5.
    movq        xmm2, xmm1            ;low 64 bits (upper half cleared)
    movq        xmm3, xmm5

    psrldq      xmm1, 8               ;high 64 bits moved down
    psrldq      xmm5, 8

    paddd       xmm1, xmm5
    paddd       xmm2, xmm3

    paddd       xmm1, xmm2
    movq        xmm5, xmm1

    psrldq      xmm1, 4
    paddd       xmm5, xmm1

    movq        rcx,  xmm5
    and         rcx,  0xffff          ;m = low 16 bits of the mask sum

    xor         rdx,  rdx
    sub         rdx,  rcx             ;rdx = -m (negative iff m != 0)

    bsr         rax,  rcx             ;highest set bit (undefined if m == 0)
    inc         rax

    sar         rdx,  31              ;all ones if m != 0, else 0
    and         rax,  rdx             ;force result to 0 when m == 0

    movdqa      XMMWORD PTR[rsi], xmm6        ;store dqcoeff
    movdqa      XMMWORD PTR[rsi + 16], xmm7   ;store dqcoeff

    movdqa      xmm6, XMMWORD PTR[rsp + save_xmm6]
    movdqa      xmm7, XMMWORD PTR[rsp + save_xmm7]

    add         rsp, vp8_fastquantizeb_stack_size
    pop         rsp                   ;undo ALIGN_STACK

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret