1;
2;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
SECTION_RODATA
; Eight 16-bit words, each equal to 1. QUANTIZE_FN subtracts this from the
; zbin vector so that its signed greater-than compare (pcmpgtw) behaves as
; "abs(coeff) >= zbin".
pw_1: times 8 dw 1

SECTION .text
17
;-----------------------------------------------------------------------------
; QUANTIZE_FN suffix, num_gprs
;
; Emits the function quantize_<suffix> through cglobal. The second macro
; argument is forwarded to cglobal as the general purpose register count.
; The suffix b_32x32 enables the 32x32 variant: zbin and round are halved
; (rounding up), the quant shift is doubled, and the dequantized magnitude
; is halved before the sign is restored.
;
; Arguments, in prototype order:
;   coeff    input coefficients (32-bit each when CONFIG_VP9_HIGHBITDEPTH is
;            set, otherwise 16-bit)
;   ncoeff   number of coefficients; 16 are processed per iteration
;   skip     dword flag; non-zero selects the zero-fill path at .blank
;   zbin, round, quant, shift, dequant
;            16-byte tables of 16-bit words. The low eight words are applied
;            to the first eight coefficients; the upper four words are then
;            replicated and applied to every remaining coefficient.
;   qcoeff   output: quantized coefficients (same element width as coeff)
;   dqcoeff  output: dequantized coefficients (same element width as coeff)
;   eob      output: pointer to a 16-bit value receiving the largest
;            iscan[i] + 1 over positions that passed the zbin test and
;            produced a non-zero result, or 0 if there are none
;   scan     not referenced by this code
;   iscan    16-bit per-position values used to derive eob
;
; All vector loads and stores use mova, so every buffer is accessed as
; aligned 16-byte data. The code uses m0-m14 and therefore targets x86-64.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  mova                            m0, [zbinq]              ; m0 = zbin
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
%ifidn %1, b_32x32
  pcmpeqw                         m5, m5                   ; m5 = all ones
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  psubw                           m0, [pw_1]               ; zbin - 1, so ">" acts as ">="
  mov                             r2, shiftmp
  mov                             r3, qcoeffmp
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, b_32x32
  psllw                           m4, 1
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  ; From here on the argument names refer to the registers loaded above.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
  ; Point every array at its end and count ncoeff up from -n towards zero.
%if CONFIG_VP9_HIGHBITDEPTH
  lea                         coeffq, [  coeffq+ncoeffq*4]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
%else
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
%endif
  lea                         iscanq, [  iscanq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
%if CONFIG_VP9_HIGHBITDEPTH
  ; coeff stored as 32bit numbers & require 16bit numbers
  mova                            m9, [  coeffq+ncoeffq*4+ 0]
  packssdw                        m9, [  coeffq+ncoeffq*4+16]
  mova                           m10, [  coeffq+ncoeffq*4+32]
  packssdw                       m10, [  coeffq+ncoeffq*4+48]
%else
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
%endif
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq                      m0, m0                   ; keep only the upper-half zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1                   ; keep only the upper-half round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2                   ; keep only the upper-half quant
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq                      m4, m4                   ; keep only the upper-half shift
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                            m8, m7                   ; zero lanes below zbin
  pand                           m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  mova                           m11, m8
  mova                            m6, m8
  pcmpgtw                         m5, m8                   ; m5 = sign mask (0 > m8)
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor                            m5, m5
  mova                           m11, m13
  mova                            m6, m13
  pcmpgtw                         m5, m13                  ; m5 = sign mask (0 > m13)
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor                            m5, m5             ; reset m5 to zero register
%else
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%endif
%ifidn %1, b_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3                   ; keep only the upper-half dequant
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                           m8, 1                    ; halve the magnitude
  psrlw                          m13, 1
  psignw                          m8, m9                   ; then restore the sign
  psignw                         m13, m10
%endif
%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  mova                           m11, m8
  mova                            m6, m8
  pcmpgtw                         m5, m8                   ; m5 = sign mask (0 > m8)
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova       [dqcoeffq+ncoeffq*4+ 0], m11
  mova       [dqcoeffq+ncoeffq*4+16], m6
  pxor                            m5, m5
  mova                           m11, m13
  mova                            m6, m13
  pcmpgtw                         m5, m13                  ; m5 = sign mask (0 > m13)
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova       [dqcoeffq+ncoeffq*4+32], m11
  mova       [dqcoeffq+ncoeffq*4+48], m6
  pxor                            m5, m5             ; reset m5 to zero register
%else
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
%endif
  pcmpeqw                         m8, m5                   ; m8 = mask of zero results
  pcmpeqw                        m13, m5                   ; m13 = mask of zero results
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m12                  ; m11 = scan[i] + 1
  pandn                           m8, m6                   ; m8 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13                  ; m8 = running eob maximum
  add                        ncoeffq, mmsize
  jz .accumulate_eob

  ; Remaining coefficients: m0-m4 now hold the replicated upper-half
  ; parameters and m8 carries the running eob maximum.
.ac_only_loop:
%if CONFIG_VP9_HIGHBITDEPTH
  ; pack coeff from 32bit to 16bit array
  mova                            m9, [  coeffq+ncoeffq*4+ 0]
  packssdw                        m9, [  coeffq+ncoeffq*4+16]
  mova                           m10, [  coeffq+ncoeffq*4+32]
  packssdw                       m10, [  coeffq+ncoeffq*4+48]
%else
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
%endif
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; Nothing in this group of 16 passes zbin: just store zeroes.
  pmovmskb                       r6d, m7
  pmovmskb                       r2d, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                           m14, m7                   ; zero lanes below zbin
  pand                           m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  mova                           m11, m14
  mova                            m6, m14
  pcmpgtw                         m5, m14                  ; m5 = sign mask (0 > m14)
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor                            m5, m5
  mova                           m11, m13
  mova                            m6, m13
  pcmpgtw                         m5, m13                  ; m5 = sign mask (0 > m13)
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor                            m5, m5             ; reset m5 to zero register
%else
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%endif
%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                          m14, 1                    ; halve the magnitude
  psrlw                          m13, 1
  psignw                         m14, m9                   ; then restore the sign
  psignw                         m13, m10
%endif
%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  mova                           m11, m14
  mova                            m6, m14
  pcmpgtw                         m5, m14                  ; m5 = sign mask (0 > m14)
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova       [dqcoeffq+ncoeffq*4+ 0], m11
  mova       [dqcoeffq+ncoeffq*4+16], m6
  pxor                            m5, m5
  mova                           m11, m13
  mova                            m6, m13
  pcmpgtw                         m5, m13                  ; m5 = sign mask (0 > m13)
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova       [dqcoeffq+ncoeffq*4+32], m11
  mova       [dqcoeffq+ncoeffq*4+48], m6
  pxor                            m5, m5             ; reset m5 to zero register
%else
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
%endif
  pcmpeqw                        m14, m5                   ; m14 = mask of zero results
  pcmpeqw                        m13, m5                   ; m13 = mask of zero results
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m12                  ; m11 = scan[i] + 1
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  ; m5 is zero here: clear this group of 16 outputs and move on.
%if CONFIG_VP9_HIGHBITDEPTH
  mova        [qcoeffq+ncoeffq*4+ 0], m5
  mova        [qcoeffq+ncoeffq*4+16], m5
  mova        [qcoeffq+ncoeffq*4+32], m5
  mova        [qcoeffq+ncoeffq*4+48], m5
  mova       [dqcoeffq+ncoeffq*4+ 0], m5
  mova       [dqcoeffq+ncoeffq*4+16], m5
  mova       [dqcoeffq+ncoeffq*4+32], m5
  mova       [dqcoeffq+ncoeffq*4+48], m5
%else
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
%endif
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0
  ; Store exactly one word, matching the word store on the skip path, so
  ; that nothing beyond the 16-bit eob value is overwritten.
  mov                           [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  ; The DEFINE_ARGS issued earlier in this macro rebound qcoeff and dqcoeff
  ; to other argument slots. Restore the prototype order first so that the
  ; loads below fetch the real dqcoeff, qcoeff and eob arguments.
  DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, \
              dqcoeff, dequant, eob, scan, iscan
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  ; Point both outputs at their end and count ncoeff up from -n to zero.
%if CONFIG_VP9_HIGHBITDEPTH
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
%else
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
%endif
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
%if CONFIG_VP9_HIGHBITDEPTH
  mova       [dqcoeffq+ncoeffq*4+ 0], m7
  mova       [dqcoeffq+ncoeffq*4+16], m7
  mova       [dqcoeffq+ncoeffq*4+32], m7
  mova       [dqcoeffq+ncoeffq*4+48], m7
  mova        [qcoeffq+ncoeffq*4+ 0], m7
  mova        [qcoeffq+ncoeffq*4+16], m7
  mova        [qcoeffq+ncoeffq*4+32], m7
  mova        [qcoeffq+ncoeffq*4+48], m7
%else
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
%endif
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro
343
; Instantiate the SSSE3 XMM versions of the quantizer: quantize_b and
; quantize_b_32x32. The second argument (7) is what QUANTIZE_FN forwards to
; cglobal as its register count; the macro uses r0 through r6.
INIT_XMM ssse3
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7
347