;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
;                           short *qcoeff_ptr,short *dequant_ptr,
;                           short *scan_mask, short *round_ptr,
;                           short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes and dequantizes 16 signed 16-bit coefficients (32 bytes read from
; each input array, 32 bytes written to each output array), four at a time.
;
; Per coefficient i:
;     a          = abs(coeff[i])
;     x          = (zbin[i] > a) ? 0 : a              ; signed 16-bit compare
;     y          = ((x + round[i]) * quant[i]) >> 16  ; 16-bit add, unsigned
;                                                     ; high-half multiply
;     qcoeff[i]  = y with the sign of coeff[i] restored
;     dqcoeff[i] = low 16 bits of qcoeff[i] * dequant[i]
;
; Return value (rax):
;     m = low 16 bits of the sum of scan_mask[i] over all i with
;         qcoeff[i] != 0
;     returns 0 when m == 0, otherwise 1 + index of the highest set bit of m.
;     NOTE(review): presumably scan_mask[i] == 1 << (scan position of i), which
;     would make the result the end-of-block position - confirm with the caller.
;
; Arguments are read through the arg(n) macro in prototype order
; (arg(0) = coeff_ptr ... arg(7) = dqcoeff_ptr).
; Clobbers: rax, rcx, rdx, mm0-mm7, flags. rsi and rdi are saved and restored.
;
; NOTE(review): pmulhuw was introduced with the SSE integer extensions, so
; despite the _mmx suffix this routine needs more than baseline MMX.
; NOTE(review): no emms is issued here; the caller presumably clears the MMX
; state before any x87 code runs - confirm.

; FAST_QUANTIZE_4 <byte offset>
; Quantizes/dequantizes the four coefficients at the given byte offset.
; Expects: rsi = coeff_ptr, rcx = round_ptr, rdx = quant_ptr, rdi = qcoeff_ptr.
; Clobbers: rax, mm0-mm4.
%macro FAST_QUANTIZE_4 1
        movq            mm0,        [rsi + %1]      ; coeff

        mov             rax,        arg(1)          ; zbin_ptr
        movq            mm1,        [rax + %1]      ; zbin

        movq            mm2,        mm0
        psraw           mm2,        15              ; sign mask: 0 or -1 per word

        pxor            mm0,        mm2
        psubw           mm0,        mm2             ; mm0 = abs(coeff)

        pcmpgtw         mm1,        mm0             ; 0xffff where zbin > abs
        pandn           mm1,        mm0             ; abs, or 0 where below zbin

        movq            mm3,        [rdx + %1]      ; quant
        movq            mm4,        [rcx + %1]      ; round

        paddw           mm1,        mm4
        pmulhuw         mm1,        mm3             ; ((x + round) * quant) >> 16

        pxor            mm1,        mm2
        psubw           mm1,        mm2             ; gain the sign back

        movq            [rdi + %1], mm1             ; qcoeff

        mov             rax,        arg(3)          ; dequant_ptr
        movq            mm3,        [rax + %1]      ; dequant

        pmullw          mm1,        mm3
        mov             rax,        arg(7)          ; dqcoeff_ptr

        movq            [rax + %1], mm1             ; dqcoeff
%endmacro

; ACCUM_SCAN_MASK_4 <byte offset>
; Adds the scan_mask words of the non-zero quantized coefficients at the given
; byte offset into the two dword lanes of mm5.
; Expects: rdi = qcoeff_ptr, rsi = scan_mask, mm6 = all ones, mm7 = zero.
; Clobbers: mm0.
%macro ACCUM_SCAN_MASK_4 1
        movq            mm0,        [rdi + %1]      ; qcoeff
        pcmpeqw         mm0,        mm7             ; 0xffff where qcoeff == 0
        pxor            mm0,        mm6             ; 0xffff where qcoeff != 0
        psrlw           mm0,        15              ; 1 where qcoeff != 0
        pmaddwd         mm0,        [rsi + %1]      ; pairwise sum of selected masks
        paddd           mm5,        mm0
%endmacro

global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
sym(vp8_fast_quantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    push rsi
    push rdi
    ; end prolog

        ; Pointers that stay live across all four quantization steps.
        mov             rsi,        arg(0) ;coeff_ptr
        mov             rcx,        arg(5) ;round_ptr
        mov             rdx,        arg(6) ;quant_ptr
        mov             rdi,        arg(2) ;qcoeff_ptr

        ; 16 coefficients, 4 (8 bytes) per step
        FAST_QUANTIZE_4 0
        FAST_QUANTIZE_4 8
        FAST_QUANTIZE_4 16
        FAST_QUANTIZE_4 24

        ; Build the mask of non-zero quantized coefficients from the values
        ; just stored through qcoeff_ptr (rdi still points at them).
        mov             rsi,        arg(4) ;scan_mask

        pxor            mm5,        mm5             ; accumulator
        pxor            mm7,        mm7             ; zero, for the == 0 compare
        pcmpeqw         mm6,        mm6             ; all ones, used to invert

        ACCUM_SCAN_MASK_4 0
        ACCUM_SCAN_MASK_4 8
        ACCUM_SCAN_MASK_4 16
        ACCUM_SCAN_MASK_4 24

        ; fold the two dword lanes together
        movq            mm0,        mm5
        psrlq           mm5,        32
        paddd           mm0,        mm5

        ; eob adjustment begins here
        movq            rcx,        mm0
        and             rcx,        0xffff          ; only 16 mask bits matter

        xor             rdx,        rdx
        sub             rdx,        rcx ; rdx=-rcx

        ; bsr leaves rax undefined when rcx == 0; rdx is 0 in that case and
        ; the final and forces the result to 0.
        bsr             rax,        rcx
        inc             rax

        sar             rdx,        31              ; all ones if rcx != 0, else 0
        and             rax,        rdx
        ; The branch-free sequence above replaces the old mixed assembly/C
        ; version, which is kept here as reference:
        ;    movq            rcx,        mm0
        ;    bsr             rax,        rcx
        ;
        ;    mov             eob,        rax
        ;    mov             eee,        rcx
        ;
        ;if(eee==0)
        ;{
        ;    eob=-1;
        ;}
        ;else if(eee<0)
        ;{
        ;    eob=15;
        ;}
        ;d->eob = eob+1;

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret