;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; Eight 16-bit words of 1; subtracted from the zbin vector in QUANTIZE_FN so
; that a signed greater-than compare implements "abs(coeff) >= zbin".
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FN <name>, <num_gprs>
;
; Emits the function quantize_<name> (declared through cglobal) taking, in
; order: coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff,
; dequant, zbin_oq, eob, scan, iscan.
;
; Behaviour visible in this macro:
;   * skip != 0 : qcoeff[] and dqcoeff[] are zero-filled and 0 is stored to
;                 the 16-bit word at eob.
;   * skip == 0 : for every coefficient c (handled as a 16-bit word)
;                   a   = abs(c)
;                   t   = sat16(a + round)
;                   q   = ((((t * quant) >> 16) + t) * shift) >> 16
;                   qc  = (a > zbin + zbin_oq - 1) ? sign(c) * q : 0
;                   dqc = qc * dequant
;                 and eob = max over non-zero qc of (iscan[i] + 1).
;   * b_32x32   : zbin + zbin_oq and round are replaced by (x + 1) >> 1, shift
;                 is doubled, and dqc = sign(c) * ((abs(qc) * dequant) >> 1).
;                 A group of 16 coefficients that are all below the threshold
;                 is written as zeros without running the multiply chain.
;
; Lane 0 of the first vector uses word 0 of zbin/round/quant/shift/dequant
; (the DC entry); after the first 8 coefficients each table register is
; replaced by its own upper half (punpckhqdq x, x), which is then used for
; every remaining coefficient.
;
; Register roles in the main path:
;   m0 = zbin + zbin_oq - 1   m1 = round   m2 = quant   m3 = dequant
;   m4 = shift                m5 = zero    m8 = running eob maximum
;
; NOTE(review): each iteration consumes 16 coefficients through mova, so
; ncoeff is assumed to be a non-zero multiple of 16 and all arrays are assumed
; to satisfy mova's alignment requirement - confirm against callers.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
  mova                            m0, [zbinq]              ; m0 = zbin
  punpcklwd                       m4, m4
  mova                            m1, [roundq]             ; m1 = round
  pshufd                          m4, m4, 0                ; broadcast zbin_oq to all words
  mova                            m2, [quantq]             ; m2 = quant
  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
  pcmpeqw                         m5, m5
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  psubw                           m0, [pw_1]               ; threshold - 1, so pcmpgtw acts as >=
  mov                             r2, shiftmp
  mov                             r3, qcoeffmp
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, b_32x32
  psllw                           m4, 1
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  ; Rename the GPRs; eob stays in its original argument position so that
  ; eobmp below still refers to the caller's eob argument.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; Point every array at its end and count ncoeffq up from -ncoeff to 0.
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq                      m0, m0                   ; keep only the upper-half zbin from here on
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq                      m4, m4
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                            m8, m7                   ; zero lanes below the threshold
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m12                  ; m11 = scan[i] + 1
  pandn                           m8, m6                   ; m8 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jz .accumulate_eob

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; If no lane in either vector passes the threshold, take the zero-fill path.
  pmovmskb                        r6, m7
  pmovmskb                        r2, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                           m14, m7
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m12                  ; m11 = scan[i] + 1
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  ; All 16 coefficients quantize to zero: store zeros, leave the eob maximum
  ; in m8 untouched.
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0
  ; Store only the low 16 bits: the .blank path writes eob as a word, and a
  ; full-register store would overwrite the bytes that follow it.
  mov                             [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro

; Instantiate the zbin-based quantizer with xmm registers for SSSE3: the
; regular variant and the b_32x32 variant. The second macro argument is the
; general-purpose register count handed to cglobal.
INIT_XMM ssse3
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7

;-----------------------------------------------------------------------------
; QUANTIZE_FP <name>, <num_gprs>
;
; Emits the function quantize_<name> (declared through cglobal) with the same
; argument list as QUANTIZE_FN: coeff, ncoeff, skip, zbin, round, quant,
; shift, qcoeff, dqcoeff, dequant, zbin_oq, eob, scan, iscan. This macro never
; reads the zbin table, shift, zbin_oq or scan.
;
; Behaviour visible in this macro:
;   * skip != 0 : qcoeff[] and dqcoeff[] are zero-filled and 0 is stored to
;                 the 16-bit word at eob.
;   * skip == 0 : for every coefficient c (handled as a 16-bit word)
;                   qc  = sign(c) * ((sat16(abs(c) + round) * quant) >> 16)
;                   dqc = qc * dequant
;                 and eob = max over non-zero qc of (iscan[i] + 1).
;   * fp_32x32  : round is replaced by (round + 1) >> 1, quant is doubled and
;                 dqc = sign(c) * ((abs(qc) * dequant) >> 1). After the first
;                 16 coefficients, a group of 16 in which no abs(c) exceeds
;                 (dequant >> 2) is written as zeros without being quantized.
;
; Lane 0 of the first vector uses word 0 of round/quant/dequant (the DC
; entry); after the first 8 coefficients each table register is replaced by
; its own upper half (punpckhqdq x, x), used for every remaining coefficient.
;
; Register roles in the main path:
;   m0 = dequant >> 2 (fp_32x32 only)   m1 = round   m2 = quant
;   m3 = dequant   m5 = zero   m7 = all-ones (-1 per word)   m8 = running eob max
;
; NOTE(review): each iteration consumes 16 coefficients through mova, so
; ncoeff is assumed to be a non-zero multiple of 16 and all arrays are assumed
; to satisfy mova's alignment requirement - confirm against callers.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
%ifidn %1, fp_32x32
  pcmpeqw                         m5, m5
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m1, m5
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  mov                             r3, qcoeffmp
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, fp_32x32
  psllw                           m2, 1
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  ; Rename the GPRs; eob stays in its original argument position so that
  ; eobmp below still refers to the caller's eob argument.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; Point every array at its end and count ncoeffq up from -ncoeff to 0.
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpeqw                         m7, m7                   ; m7 = -1 per word (psubw below adds 1)

  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1                   ; keep only the upper-half round from here on
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, fp_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9
  psignw                         m13, m10
  psrlw                           m0, m3, 2                ; m0 = dequant >> 2, skip threshold for the loop
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m7                   ; m11 = scan[i] + 1
  pandn                           m8, m6                   ; m8 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jz .accumulate_eob

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
%ifidn %1, fp_32x32
  ; If no abs(c[i]) in either vector exceeds m0, take the zero-fill path.
  pcmpgtw                         m7, m6,  m0
  pcmpgtw                        m12, m11, m0
  pmovmskb                        r6, m7
  pmovmskb                        r2, m12

  or                              r6, r2
  jz .skip_iter
%endif
  pcmpeqw                         m7, m7                   ; restore m7 = -1 per word

  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, fp_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m7                   ; m11 = scan[i] + 1
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, fp_32x32
  jmp .accumulate_eob
.skip_iter:
  ; Store zeros for this group of 16 and leave the eob maximum in m8 untouched.
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0
  ; Store only the low 16 bits: the .blank path writes eob as a word, and a
  ; full-register store would overwrite the bytes that follow it.
  mov                             [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro

; Instantiate the round/quant-only ("fp") quantizer with xmm registers for
; SSSE3: the regular variant and the fp_32x32 variant. The second macro
; argument is the general-purpose register count handed to cglobal.
INIT_XMM ssse3
QUANTIZE_FP fp, 7
QUANTIZE_FP fp_32x32, 7
403