1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
SECTION_RODATA
; Eight 16-bit words, each holding the value 1 (one full XMM register's worth).
pw_1: dw 1, 1, 1, 1, 1, 1, 1, 1

SECTION .text
17
;------------------------------------------------------------------------------
; QUANTIZE_FN name_suffix, num_gprs
;
; Emits the function quantize_<name_suffix> through x86inc's cglobal.
;
;   %1  Name suffix.  When it is exactly b_32x32 the extra 32x32 code paths are
;       assembled: zbin and round become (x + 1) >> 1, shift is doubled,
;       dqcoeff is computed as sign(c) * ((|qc| * dequant) >> 1) in 16-bit
;       lanes, and a group of 16 coefficients that are all below zbin takes a
;       shortcut that only stores zeros.
;   %2  Number of general-purpose registers requested from cglobal.
;
; Argument order: coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff,
; dqcoeff, dequant, zbin_oq, eob, scan, iscan.  cglobal is asked to auto-load
; none of them; each one is fetched where it is first needed.  scan is never
; read by this code.
;
; What the code relies on:
;   * coeff/qcoeff/dqcoeff/iscan are accessed as arrays of 16-bit words with
;     aligned vector moves (mova), 16 words per iteration.
;   * The first 16 coefficients are always processed and the loops step by 16
;     until the negative index reaches zero, so ncoeff is expected to be a
;     positive multiple of 16.
;   * zbin/round/quant/shift/dequant are each read as one 8-word vector; the
;     full vector is used for the first 8 coefficients and its upper four
;     words are then replicated for every later coefficient.
;   * skip is tested as a dword, zbin_oq is read as a dword whose low word is
;     broadcast, and eob receives a single 16-bit store.
;   * Registers m8-m14 are used, so the 64-bit register file is required.
;
; Result: qcoeff and dqcoeff are filled, and *eob is set to the largest
; (iscan[i] + 1) over all i with a non-zero dqcoeff, or 0 if there is none.
; When skip is non-zero both outputs are zero-filled and *eob is set to 0.
;------------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp            ; skip already tested, reuse r2
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
  mova                            m0, [zbinq]              ; m0 = zbin
  punpcklwd                       m4, m4
  mova                            m1, [roundq]             ; m1 = round
  pshufd                          m4, m4, 0                ; broadcast zbin_oq to all words
  mova                            m2, [quantq]             ; m2 = quant
  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
  pcmpeqw                         m5, m5
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  psubw                           m0, [pw_1]               ; zbin - 1, so that ">" acts as ">="
  mov                             r2, shiftmp
  mov                             r3, qcoeffmp             ; zbin vector already loaded
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r4, dqcoeffmp            ; round vector already loaded
  mov                             r5, iscanmp              ; quant vector already loaded
%ifidn %1, b_32x32
  psllw                           m4, 1
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; point every array at its end and index with a negative count that runs
  ; up towards zero
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  ; After each parameter vector has been used for the first 8 coefficients its
  ; upper half is replicated (punpckhqdq) and kept for all later coefficients.
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin
  punpckhqdq                      m0, m0
  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin
  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq                      m4, m4
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                            m8, m7                   ; zero lanes below zbin
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                           m8, 1                    ; halve the magnitude ...
  psrlw                          m13, 1
  psignw                          m8, m9                   ; ... then reapply the sign
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = (dqc[i] == 0)
  pcmpeqw                        m13, m5                   ; m13 = (dqc[i] == 0)
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where mask set
  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where mask set
  pandn                           m8, m6                   ; m8 = eob candidate, 0 if dqc == 0
  pandn                          m13, m11                  ; m13 = eob candidate, 0 if dqc == 0
  pmaxsw                          m8, m13                  ; m8 = running per-lane max
  add                        ncoeffq, mmsize
  jz .accumulate_eob

  ; remaining coefficients, 16 per iteration, using the replicated AC values
.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin
%ifidn %1, b_32x32
  ; nothing in these 16 lanes reaches zbin: just store zeros
  pmovmskb                        r6, m7
  pmovmskb                        r2, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                           m14, m7                   ; zero lanes below zbin
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                          m14, 1                    ; halve the magnitude ...
  psrlw                          m13, 1
  psignw                         m14, m9                   ; ... then reapply the sign
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = (dqc[i] == 0)
  pcmpeqw                        m13, m5                   ; m13 = (dqc[i] == 0)
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where mask set
  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where mask set
  pandn                          m14, m6                   ; m14 = eob candidate, 0 if dqc == 0
  pandn                          m13, m11                  ; m13 = eob candidate, 0 if dqc == 0
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob                                      ; do not fall into .skip_iter
.skip_iter:
  ; all 16 coefficients quantize to zero; the running eob max is unaffected
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  ; pextrw with a memory destination is an SSE4.1 encoding and would fault on
  ; SSSE3-only CPUs, so extract into a register (SSE2 form) and store the low
  ; word.  r1 (ncoeff) has reached zero and is free to reuse here.
  pextrw                         r1d, m8, 0
  mov                           [r2], r1w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro
215
; Instantiate both quantizers for SSSE3 with XMM registers.
; The second macro argument is the GPR count passed to cglobal: the b_32x32
; variant asks for one more because its skip test uses r6 as a scratch register.
INIT_XMM ssse3
QUANTIZE_FN b, 6
QUANTIZE_FN b_32x32, 7
219