;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


; void vp8_regular_quantize_b_sse4 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1

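; For reference, a rough scalar equivalent of this routine (cf.
; vp8_regular_quantize_b_c in vp8/encoder/quantize.c; struct offsets
; come from vp8_asm_enc_offsets.asm):
;
;   short *zbin_boost_ptr = b->zrun_zbin_boost;
;   eob = -1;
;   for (i = 0; i < 16; i++) {                 /* zig-zag scan order */
;       rc   = vp8_default_zig_zag1d[i];
;       z    = b->coeff[rc];
;       zbin = b->zbin[rc] + b->zbin_extra + *zbin_boost_ptr++;
;       sz   = z >> 15;                        /* 0 or -1 */
;       x    = (z ^ sz) - sz;                  /* abs(z) */
;       if (x >= zbin) {
;           x += b->round[rc];
;           y  = (((x * b->quant[rc]) >> 16) + x) >> b->quant_shift[rc];
;           if (y) {
;               d->qcoeff[rc] = (y ^ sz) - sz;  /* restore the sign */
;               zbin_boost_ptr = b->zrun_zbin_boost;
;               eob = i;
;           }
;       }
;   }
;   d->dqcoeff[rc] = d->qcoeff[rc] * d->dequant[rc];   /* all 16 rc */
;   d->eob = (char)(eob + 1);
;
; The code below vectorizes everything except the zbin test, which is
; done per coefficient in ZIGZAG_LOOP; dqcoeff and eob are computed
; from the packed qcoeff at the end.
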
global sym(vp8_regular_quantize_b_sse4) PRIVATE
sym(vp8_regular_quantize_b_sse4):

%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax
    %define qcoeff      0 ; 32 bytes of qcoeff scratch
    %define stack_size 32
    sub         rsp, stack_size
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 8, u
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov         rdi, rdi                    ; BLOCK *b
    ;mov         rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_zbin]
    mov         rdx, [rdi + vp8_block_round]
    movd        xmm7, [rdi + vp8_block_zbin_extra]
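    ; rax = b->coeff, rcx = b->zbin, rdx = b->round; these three
    ; registers are reloaded with the quant pointers further down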

    ; z
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; duplicate zbin_oq_value
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7
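    ; xmm7 now holds zbin_extra in all eight word lanes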

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; (z ^ sz)
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1

    ; x = abs(z)
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1
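    ; psraw by 15 leaves 0 or -1 per word, so (z ^ sz) - sz is a
    ; branchless abs(): it negates z exactly when z is negative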

    ; zbin
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5
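    ; ZIGZAG_LOOP below subtracts the running zbin boost from these
    ; lanes; a negative result there means x < zbin, and the
    ; coefficient is skipped, leaving it zero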

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    mov         rax, [rdi + vp8_block_quant_shift]
    mov         rcx, [rdi + vp8_block_quant]
    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]

    ; x + round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = (x * quant_ptr) >> 16
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += x
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
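    ; xmm2/xmm3 = ((x * quant) >> 16) + x; only the per-coefficient
    ; downshift by quant_shift[rc] remains, done inside ZIGZAG_LOOP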

    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif
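    ; quantized values accumulate in xmm4/xmm8 on 64-bit targets, or in
    ; the zeroed qcoeff scratch buffer on the stack for 32-bit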

    ; quant_shift
    movdqa      xmm5, [rax]

    ; zrun_zbin_boost
    mov         rax, rdx

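; ZIGZAG_LOOP processes one coefficient:
;   %1 = rc, the raster-order position (indexes quant_shift and, on
;        32-bit targets, the qcoeff scratch word)
;   %2 = word lane within the source registers, i.e. rc & 7
;   %3 = register holding y = ((x * quant) >> 16) + x for this half
;   %4 = register holding x - (zbin + zbin_oq) for this half
;   %5 = register collecting qcoeff values (64-bit targets only)
; rdx walks b->zrun_zbin_boost and is reset via rax whenever a nonzero
; quantized value is emitted.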
%macro ZIGZAG_LOOP 5
    ; x - (zbin + zbin_oq), computed above
    pextrw      ecx, %4, %2

    ; if (x >= zbin + zbin_oq + zbin_boost)
    sub         cx, WORD PTR[rdx]           ; subtract running zbin boost
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
    jl          .rq_zigzag_loop_%1          ; x < zbin

    pextrw      edi, %3, %2                 ; y

    ; downshift by quant_shift[rc]
    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
    sar         edi, cl                     ; also sets Z bit
    je          .rq_zigzag_loop_%1          ; !y
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 *2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc]
%endif
    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
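; rc 0-7 live in xmm2/xmm6 with results in xmm4; rc 8-15 live in
; xmm3/xmm7 with results in xmm8, hence the lane argument rc & 7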
ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

%if ABI_IS_32BIT
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    %define     xmm5 xmm8
%endif
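    ; from here on xmm4/xmm5 hold the quantized magnitudes: reloaded
    ; from the scratch buffer on 32-bit, or already live in xmm4/xmm8
    ; on 64-bit (the %define above aliases xmm5 to xmm8)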

    ; y ^ sz
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    ; x = (y ^ sz) - sz
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]

    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    mov         rcx, [rsi + vp8_blockd_eob]

    ; select the last value (in zig_zag order) for EOB
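    ; pcmpeqw marks zero words, packsswb folds both halves to a 16-byte
    ; mask and pshufb permutes it into zig-zag order; after pmovmskb and
    ; the xor with -1, bit i of edx is set iff coefficient i (in scan
    ; order) is nonzero. bsr finds the last such bit; since bsr is
    ; undefined for an all-zero input, the sub/sar pair builds a mask
    ; that forces eob to 0 when every coefficient quantized to zero.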
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6
    pcmpeqw     xmm5, xmm6

    packsswb    xmm4, xmm5
    pshufb      xmm4, [GLOBAL(zig_zag1d)]
    pmovmskb    edx, xmm4
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax
    bsr         eax, edx
    sub         edi, edx
    sar         edi, 31
    add         eax, 1
    and         eax, edi

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
  %undef xmm5
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
    RESTORE_XMM
  %endif
%endif

    ret

SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
zig_zag1d:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15