quantize_mmx.asm revision f71323e297a928af368937089d3ed71239786f86
1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
;                           short *qcoeff_ptr,short *dequant_ptr,
;                           short *scan_mask, short *round_ptr,
;                           short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes the 16 signed 16-bit coefficients at coeff_ptr, four at a time:
;     a          = abs(coeff[i])
;     a          = (zbin[i] > a) ? 0 : a          (signed 16-bit compare)
;     q          = ((a + round[i]) * quant[i]) >> 16   (unsigned high half)
;     qcoeff[i]  = q with the sign of coeff[i] restored
;     dqcoeff[i] = low 16 bits of qcoeff[i] * dequant[i]
; Afterwards qcoeff is read back from memory, every non-zero entry
; contributes scan_mask[i] to a sum, and the function returns
; 1 + (index of the highest set bit in the low 16 bits of that sum),
; or 0 when those 16 bits are all clear.
; (presumably scan_mask holds one bit per coefficient in scan order, so
; the result is the end-of-block position -- confirm against the caller)
;
; Arguments are read through the arg() macro and the prolog/epilog macros
; come from x86_abi_support.asm.
; Clobbers: rax, rcx, rdx, mm0-mm2, mm5-mm7, flags.  rsi/rdi are preserved.
; NOTE(review): no emms is executed here; the caller is presumably
; responsible for clearing MMX state before x87 code runs -- confirm.
; NOTE(review): pmulhuw on MMX registers is part of the SSE integer
; extensions rather than baseline MMX.

; Quantize and dequantize the four coefficients at byte offset %1.
; In:       rsi = coeff_ptr, rcx = round_ptr, rdx = quant_ptr,
;           rdi = qcoeff_ptr
; Clobbers: rax, mm0, mm1, mm2
; zbin/dequant/dqcoeff pointers are fetched through rax on each use
; because rax, rcx, rdx, rsi and rdi are the only general registers used.
%macro VP8_FAST_QUANTIZE_MMX_4 1
        movq            mm0,        [rsi+%1]    ; coeff
        movq            mm1,        mm0
        psraw           mm1,        15          ; mm1 = sign mask (0 or 0xffff)

        pxor            mm0,        mm1
        psubw           mm0,        mm1         ; mm0 = abs(coeff)

        mov             rax,        arg(1) ;zbin_ptr
        movq            mm2,        [rax+%1]
        pcmpgtw         mm2,        mm0         ; 0xffff where zbin > abs
        pandn           mm2,        mm0         ; abs, or 0 where below zbin

        paddw           mm2,        [rcx+%1]    ; + round
        pmulhuw         mm2,        [rdx+%1]    ; (x * quant) >> 16

        pxor            mm2,        mm1
        psubw           mm2,        mm1         ; gain the sign back

        movq            [rdi+%1],   mm2         ; qcoeff

        mov             rax,        arg(3) ;dequant_ptr
        pmullw          mm2,        [rax+%1]

        mov             rax,        arg(7) ;dqcoeff_ptr
        movq            [rax+%1],   mm2         ; dqcoeff
%endmacro

global sym(vp8_fast_quantize_b_impl_mmx)
sym(vp8_fast_quantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    push rsi
    push rdi
    ; end prolog

        mov             rsi,        arg(0) ;coeff_ptr
        mov             rdx,        arg(6) ;quant_ptr
        mov             rcx,        arg(5) ;round_ptr
        mov             rdi,        arg(2) ;qcoeff_ptr

        VP8_FAST_QUANTIZE_MMX_4 0               ; coefficients  0-3
        VP8_FAST_QUANTIZE_MMX_4 8               ; coefficients  4-7
        VP8_FAST_QUANTIZE_MMX_4 16              ; coefficients  8-11
        VP8_FAST_QUANTIZE_MMX_4 24              ; coefficients 12-15

        ; Sum scan_mask[i] over every non-zero qcoeff[i].
        mov             rsi,        arg(4) ;scan_mask

        pxor            mm7,        mm7         ; all zeros
        pcmpeqw         mm6,        mm6         ; all ones

        ; coefficients 0-7
        movq            mm0,        [rdi]
        movq            mm1,        [rdi+8]

        pcmpeqw         mm0,        mm7         ; 0xffff where qcoeff == 0
        pcmpeqw         mm1,        mm7

        pxor            mm0,        mm6         ; 0xffff where qcoeff != 0
        pxor            mm1,        mm6

        psrlw           mm0,        15          ; 1 where qcoeff != 0
        psrlw           mm1,        15

        pmaddwd         mm0,        [rsi]       ; pairwise sums of selected mask words
        pmaddwd         mm1,        [rsi+8]

        movq            mm5,        mm0
        paddd           mm5,        mm1         ; mm5 = running dword sums

        ; coefficients 8-15
        movq            mm0,        [rdi+16]
        movq            mm1,        [rdi+24]

        pcmpeqw         mm0,        mm7
        pcmpeqw         mm1,        mm7

        pxor            mm0,        mm6
        pxor            mm1,        mm6

        psrlw           mm0,        15
        psrlw           mm1,        15

        pmaddwd         mm0,        [rsi+16]
        pmaddwd         mm1,        [rsi+24]

        paddd           mm5,        mm0
        paddd           mm5,        mm1

        ; fold the two dword lanes together
        movq            mm0,        mm5
        psrlq           mm5,        32
        paddd           mm0,        mm5

        ; eob adjustment begins here
        ; Equivalent C for the tail below, with m = low 16 bits of the sum:
        ;     return m ? (index of highest set bit of m) + 1 : 0;
        movd            rcx,        mm0
        and             rcx,        0xffff

        bsr             rax,        rcx         ; undefined when rcx == 0 ...
        inc             rax

        neg             rcx                     ; negative iff any mask bit was set
        sar             rcx,        31          ; all ones if so, else 0
        and             rax,        rcx         ; ... so force the result to 0 then

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
287
288
;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
;                           short *qcoeff_ptr,short *dequant_ptr,
;                           short *scan_mask, short *round_ptr,
;                           short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes the 16 signed 16-bit coefficients at coeff_ptr, eight at a time:
;     a          = abs(coeff[i])
;     a          = (zbin[i] > a) ? 0 : a          (signed 16-bit compare)
;     q          = ((a + round[i]) * quant[i]) >> 16   (unsigned high half)
;     qcoeff[i]  = q with the sign of coeff[i] restored
;     dqcoeff[i] = low 16 bits of qcoeff[i] * dequant[i]
; Every non-zero qcoeff[i] then contributes scan_mask[i] to a sum, and the
; function returns 1 + (index of the highest set bit in the low 16 bits of
; that sum), or 0 when those 16 bits are all clear.
; (presumably scan_mask holds one bit per coefficient in scan order, so
; the result is the end-of-block position -- confirm against the caller)
;
; All eight pointers are accessed with aligned 128-bit loads/stores, so
; each must be 16-byte aligned.  Despite the "sse" suffix, the integer
; operations on xmm registers used here are SSE2 instructions.
;
; Only xmm0-xmm5 are used: xmm6-xmm15 are callee-saved in the Microsoft
; x64 calling convention, and this routine does not save any xmm register.
; Clobbers: rax, rcx, rdx, xmm0-xmm5, flags.  rsi/rdi are preserved.

; Quantize and dequantize the eight coefficients at byte offset %1 and
; leave the resulting qcoeff values in xmm register %2.
; In:       rsi = coeff_ptr, rcx = round_ptr, rdx = quant_ptr,
;           rdi = qcoeff_ptr
; Clobbers: rax, xmm4, xmm5, %2
%macro VP8_FAST_QUANTIZE_SSE_8 2
        movdqa          %2,         [rsi+%1]    ; coeff
        movdqa          xmm4,       %2
        psraw           xmm4,       15          ; xmm4 = sign mask (0 or 0xffff)

        pxor            %2,         xmm4
        psubw           %2,         xmm4        ; %2 = abs(coeff)

        mov             rax,        arg(1) ;zbin_ptr
        movdqa          xmm5,       [rax+%1]
        pcmpgtw         xmm5,       %2          ; 0xffff where zbin > abs
        pandn           xmm5,       %2          ; abs, or 0 where below zbin

        paddw           xmm5,       [rcx+%1]    ; + round
        pmulhuw         xmm5,       [rdx+%1]    ; (x * quant) >> 16

        pxor            xmm5,       xmm4
        psubw           xmm5,       xmm4        ; gain the sign back

        movdqa          %2,         xmm5        ; keep qcoeff for the eob scan
        movdqa          [rdi+%1],   xmm5        ; qcoeff

        mov             rax,        arg(3) ;dequant_ptr
        pmullw          xmm5,       [rax+%1]

        mov             rax,        arg(7) ;dqcoeff_ptr
        movdqa          [rax+%1],   xmm5        ; dqcoeff
%endmacro

global sym(vp8_fast_quantize_b_impl_sse)
sym(vp8_fast_quantize_b_impl_sse):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    push rsi
    push rdi
    ; end prolog

        mov             rsi,        arg(0) ;coeff_ptr
        mov             rdx,        arg(6) ;quant_ptr
        mov             rcx,        arg(5) ;round_ptr
        mov             rdi,        arg(2) ;qcoeff_ptr

        VP8_FAST_QUANTIZE_SSE_8 0,  xmm0        ; coefficients 0-7
        VP8_FAST_QUANTIZE_SSE_8 16, xmm1        ; coefficients 8-15

        ; Sum scan_mask[i] over every non-zero qcoeff[i].
        mov             rax,        arg(4) ;scan_mask

        pxor            xmm2,       xmm2        ; all zeros
        pcmpeqw         xmm3,       xmm3        ; all ones

        pcmpeqw         xmm0,       xmm2        ; 0xffff where qcoeff == 0
        pcmpeqw         xmm1,       xmm2

        pxor            xmm0,       xmm3        ; 0xffff where qcoeff != 0
        pxor            xmm1,       xmm3

        psrlw           xmm0,       15          ; 1 where qcoeff != 0
        psrlw           xmm1,       15

        pmaddwd         xmm0,       [rax]       ; pairwise sums of selected mask words
        pmaddwd         xmm1,       [rax+16]

        paddd           xmm0,       xmm1        ; four dword partial sums

        ; horizontal add of the four dwords into the low dword
        movdqa          xmm1,       xmm0
        psrldq          xmm1,       8
        paddd           xmm0,       xmm1

        movdqa          xmm1,       xmm0
        psrldq          xmm1,       4
        paddd           xmm0,       xmm1

        ; eob adjustment: with m = low 16 bits of the sum,
        ;     return m ? (index of highest set bit of m) + 1 : 0;
        movd            rcx,        xmm0
        and             rcx,        0xffff

        bsr             rax,        rcx         ; undefined when rcx == 0 ...
        inc             rax

        neg             rcx                     ; negative iff any mask bit was set
        sar             rcx,        31          ; all ones if so, else 0
        and             rax,        rcx         ; ... so force the result to 0 then

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
440