;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "asm_enc_offsets.asm"


; void vp8_regular_quantize_b_sse2 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1
;-----------------------------------------------------------------------------
; void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
;
; Full ("regular") quantizer for one 4x4 block of 16 DCT coefficients.
; Vector phase: computes x = abs(z), the zbin comparison values and the
; candidate quantized magnitudes for all 16 lanes at once.  Scalar phase
; (ZIGZAG_LOOP): applies the zig-zag-ordered zbin-boost rule and the
; per-coefficient quant_shift, accepting or rejecting each candidate.
; Finally signs are restored, qcoeff/dqcoeff are stored into *d and the
; eob (1-based last-nonzero position in scan order) is written to d->eob.
;
; All coefficient/table pointers in *b and *d are accessed with movdqa and
; so are assumed 16-byte aligned -- TODO(review): confirm against callers.
;
; ABI handling: on Win64, b arrives in rcx and d in rdx; on SysV x64, b in
; rdi and d in rsi; d is spilled to the BLOCKD_d stack slot so rsi/rdi can
; serve as scratch during the scalar pass.  SAVE_XMM/GET_GOT/ALIGN_STACK
; are the usual vpx_ports/x86_abi_support.asm prolog macros.
;-----------------------------------------------------------------------------
global sym(vp8_regular_quantize_b_sse2)
sym(vp8_regular_quantize_b_sse2):
    push        rbp
    mov         rbp, rsp
    SAVE_XMM
    GET_GOT     rbx
    push        rsi

%if ABI_IS_32BIT
    push        rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    push        rdi
  %endif
%endif

    ; Aligned local frame; offsets are from the post-ALIGN_STACK rsp.
    ALIGN_STACK 16, rax
    %define BLOCKD_d          0  ;  8  spilled BLOCKD *d (64-bit ABIs only)
    %define zrun_zbin_boost   8  ;  8  saved b->zrun_zbin_boost base pointer
    %define abs_minus_zbin    16 ; 32  16 words: abs(z) - (zbin + zbin_oq)
    %define temp_qcoeff       48 ; 32  16 words: candidate quantized values
    %define qcoeff            80 ; 32  16 words: accepted quantized values
    %define stack_size        112
    sub         rsp, stack_size
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)
%else
  %ifidn __OUTPUT_FORMAT__,x64
    mov         rdi, rcx                    ; BLOCK *b
    mov         [rsp + BLOCKD_d], rdx
  %else
    ;mov         rdi, rdi                    ; BLOCK *b
    mov         [rsp + BLOCKD_d], rsi
  %endif
%endif

    mov         rdx, [rdi + vp8_block_coeff] ; coeff_ptr
    mov         rcx, [rdi + vp8_block_zbin] ; zbin_ptr
    movd        xmm7, [rdi + vp8_block_zbin_extra] ; zbin_oq_value

    ; z
    movdqa      xmm0, [rdx]
    movdqa      xmm4, [rdx + 16]
    mov         rdx, [rdi + vp8_block_round] ; round_ptr

    ; broadcast the 16-bit zbin_oq_value into all 8 word lanes of xmm7
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7                  ; duplicated zbin_oq_value

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15 (all-ones word where z is negative, else zero)
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; (z ^ sz)
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4

    ; x = abs(z)  (two's-complement via (z ^ sz) - sz)
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]
    mov         rcx, [rdi + vp8_block_quant] ; quant_ptr

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; x - (*zbin_ptr + zbin_oq_value); sign of each word tells the scalar
    ; pass whether the coefficient clears the (un-boosted) zbin threshold
    psubw       xmm1, xmm2
    psubw       xmm5, xmm3
    movdqa      [rsp + abs_minus_zbin], xmm1
    movdqa      [rsp + abs_minus_zbin + 16], xmm5

    ; add (zbin_ptr + zbin_oq_value) back, recovering x = abs(z)
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    movdqa      xmm2, [rdx]
    movdqa      xmm6, [rdx + 16]

    movdqa      xmm3, [rcx]
    movdqa      xmm7, [rcx + 16]

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm6

    ; y = x * quant_ptr >> 16
    pmulhw      xmm3, xmm1
    pmulhw      xmm7, xmm5

    ; y += x   (candidate quantized magnitude before quant_shift)
    paddw       xmm1, xmm3
    paddw       xmm5, xmm7

    movdqa      [rsp + temp_qcoeff], xmm1
    movdqa      [rsp + temp_qcoeff + 16], xmm5

    pxor        xmm6, xmm6
    ; zero qcoeff; the scalar pass only writes lanes that are accepted
    movdqa      [rsp + qcoeff], xmm6
    movdqa      [rsp + qcoeff + 16], xmm6

    mov         rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
    mov         rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
    mov         [rsp + zrun_zbin_boost], rsi

; Scalar pass, one expansion per zig-zag position %1:
;   rc = zig_zag[%1]
;   if (abs_minus_zbin[rc] >= *zbin_boost_ptr) and
;      (temp_qcoeff[rc] >> quant_shift[rc]) != 0:
;        qcoeff[rc] = shifted value; boost pointer resets to the base
;   else: boost pointer advances (raising the threshold after a zero run)
; Registers: rsi = zbin_boost_ptr, rax = quant_shift_ptr,
;            rcx/rdx/rdi = scratch.
%macro ZIGZAG_LOOP 1
    movsx       edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc

    ; x
    movsx       ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]

    ; if (x >= zbin)
    sub         cx, WORD PTR[rsi]           ; x - zbin
    lea         rsi, [rsi + 2]              ; zbin_boost_ptr++
    jl          rq_zigzag_loop_%1           ; x < zbin

    movsx       edi, WORD PTR[rsp + temp_qcoeff + rdx *2]

    ; downshift by quant_shift[rdx]
    movsx       ecx, WORD PTR[rax + rdx*2]  ; quant_shift_ptr[rc]
    sar         edi, cl                     ; also sets Z bit
    je          rq_zigzag_loop_%1           ; !y
    mov         WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
    mov         rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
rq_zigzag_loop_%1:
%endmacro
ZIGZAG_LOOP 0
ZIGZAG_LOOP 1
ZIGZAG_LOOP 2
ZIGZAG_LOOP 3
ZIGZAG_LOOP 4
ZIGZAG_LOOP 5
ZIGZAG_LOOP 6
ZIGZAG_LOOP 7
ZIGZAG_LOOP 8
ZIGZAG_LOOP 9
ZIGZAG_LOOP 10
ZIGZAG_LOOP 11
ZIGZAG_LOOP 12
ZIGZAG_LOOP 13
ZIGZAG_LOOP 14
ZIGZAG_LOOP 15

    movdqa      xmm2, [rsp + qcoeff]
    movdqa      xmm3, [rsp + qcoeff + 16]

%if ABI_IS_32BIT
    mov         rdi, arg(1)
%else
    mov         rdi, [rsp + BLOCKD_d]
%endif

    mov         rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr
    mov         rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr

    ; restore the original signs saved in xmm0/xmm4 (still live from the
    ; vector phase): y ^ sz
    pxor        xmm2, xmm0
    pxor        xmm3, xmm4
    ; x = (y ^ sz) - sz
    psubw       xmm2, xmm0
    psubw       xmm3, xmm4

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr

    pmullw      xmm0, xmm2
    pmullw      xmm1, xmm3

    movdqa      [rcx], xmm2        ; store qcoeff
    movdqa      [rcx + 16], xmm3
    movdqa      [rsi], xmm0        ; store dqcoeff
    movdqa      [rsi + 16], xmm1

    ; select the last value (in zig_zag order) for EOB
    ; (xmm6 is still zero from the vector phase)
    pcmpeqw     xmm2, xmm6
    pcmpeqw     xmm3, xmm6
    ; !
    pcmpeqw     xmm6, xmm6
    pxor        xmm2, xmm6
    pxor        xmm3, xmm6
    ; mask inv_zig_zag: nonzero lanes now hold their 1-based scan position
    pand        xmm2, [GLOBAL(inv_zig_zag)]
    pand        xmm3, [GLOBAL(inv_zig_zag + 16)]
    ; select the max value via a log2 horizontal reduction (16 -> 8 -> 4 -> 2)
    pmaxsw      xmm2, xmm3
    pshufd      xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00001110b
    pmaxsw      xmm2, xmm3
    pshuflw     xmm3, xmm2, 00000001b
    pmaxsw      xmm2, xmm3
    movd        eax, xmm2
    and         eax, 0xff
    mov         [rdi + vp8_blockd_eob], eax

    ; begin epilog
    add         rsp, stack_size
    pop         rsp                         ; restore sp saved by ALIGN_STACK
%if ABI_IS_32BIT
    pop         rdi
%else
  %ifidn __OUTPUT_FORMAT__,x64
    pop         rdi
  %endif
%endif
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    pop         rbp
    ret
240
; int vp8_fast_quantize_b_impl_sse2 | arg
;  (short *coeff_ptr,               |  0
;   short *qcoeff_ptr,              |  1
;   short *dequant_ptr,             |  2
;   short *inv_scan_order,          |  3
;   short *round_ptr,               |  4
;   short *quant_ptr,               |  5
;   short *dqcoeff_ptr)             |  6
;
; Fast quantizer for one 4x4 block of 16 coefficients: no zbin test and no
; zig-zag boost -- every lane is quantized as ((abs(z) + round) * quant) >> 16
; with the sign restored afterwards.  Writes qcoeff_ptr and dqcoeff_ptr and
; returns the eob in rax: the maximum inv_scan_order[] value over all
; nonzero lanes, i.e. the 1-based scan position of the last nonzero
; coefficient (0 when the block quantizes to all zeros).
;
; All seven pointers are accessed with movdqa and so must reference
; 16-byte aligned buffers -- TODO(review): confirm against callers.

global sym(vp8_fast_quantize_b_impl_sse2)
sym(vp8_fast_quantize_b_impl_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(0)                 ;coeff_ptr
    mov         rcx, arg(2)                 ;dequant_ptr
    mov         rdi, arg(4)                 ;round_ptr
    mov         rsi, arg(5)                 ;quant_ptr

    movdqa      xmm0, XMMWORD PTR[rdx]
    movdqa      xmm4, XMMWORD PTR[rdx + 16]

    movdqa      xmm2, XMMWORD PTR[rdi]      ;round lo
    movdqa      xmm3, XMMWORD PTR[rdi + 16] ;round hi

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    psraw       xmm0, 15                    ;sign of z (aka sz)
    psraw       xmm4, 15                    ;sign of z (aka sz)

    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0                  ;x = abs(z)
    psubw       xmm5, xmm4                  ;x = abs(z)

    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; y = (x + round) * quant >> 16
    pmulhw      xmm1, XMMWORD PTR[rsi]
    pmulhw      xmm5, XMMWORD PTR[rsi + 16]

    mov         rdi, arg(1)                 ;qcoeff_ptr
    mov         rsi, arg(6)                 ;dqcoeff_ptr

    movdqa      xmm2, XMMWORD PTR[rcx]
    movdqa      xmm3, XMMWORD PTR[rcx + 16]

    ; restore sign: x = (y ^ sz) - sz
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      XMMWORD PTR[rdi], xmm1
    movdqa      XMMWORD PTR[rdi + 16], xmm5

    ; dqcoeff = qcoeff * dequant
    pmullw      xmm2, xmm1
    pmullw      xmm3, xmm5

    mov         rdi, arg(3)                 ;inv_scan_order

    ; EOB: max-reduce inv_scan_order over nonzero lanes.  Start with 16
    pxor        xmm4, xmm4                  ;clear all bits
    pcmpeqw     xmm1, xmm4
    pcmpeqw     xmm5, xmm4

    pcmpeqw     xmm4, xmm4                  ;set all bits
    pxor        xmm1, xmm4
    pxor        xmm5, xmm4

    pand        xmm1, XMMWORD PTR[rdi]
    pand        xmm5, XMMWORD PTR[rdi+16]

    pmaxsw      xmm1, xmm5

    ; now down to 8
    pshufd      xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; only 4 left
    pshuflw     xmm5, xmm1, 00001110b

    pmaxsw      xmm1, xmm5

    ; okay, just 2!
    pshuflw     xmm5, xmm1, 00000001b

    pmaxsw      xmm1, xmm5

    ; movd with a 64-bit GPR destination is really movq and is rejected (or
    ; silently rewritten) by some assemblers; use the 32-bit form, matching
    ; vp8_regular_quantize_b_sse2 above.  Writing eax zero-extends into rax,
    ; so the returned value is unchanged.
    movd        eax, xmm1
    and         eax, 0xff

    movdqa      XMMWORD PTR[rsi], xmm2        ;store dqcoeff
    movdqa      XMMWORD PTR[rsi + 16], xmm3   ;store dqcoeff

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
347
SECTION_RODATA
align 16
; 4x4 zig-zag scan: zig_zag[i] = raster (row-major) index of the i-th
; coefficient visited in scan order.
zig_zag:
  dw 0x0000, 0x0001, 0x0004, 0x0008
  dw 0x0005, 0x0002, 0x0003, 0x0006
  dw 0x0009, 0x000c, 0x000d, 0x000a
  dw 0x0007, 0x000b, 0x000e, 0x000f
; Inverse table, offset by one: inv_zig_zag[r] = 1 + scan position of raster
; index r (range 1..16).  The +1 bias lets the EOB max-reduction report the
; count of coefficients up to and including the last nonzero one, with 0
; meaning "all zero".
inv_zig_zag:
  dw 0x0001, 0x0002, 0x0006, 0x0007
  dw 0x0003, 0x0005, 0x0008, 0x000d
  dw 0x0004, 0x0009, 0x000c, 0x000e
  dw 0x000a, 0x000b, 0x000f, 0x0010