; quantize_ssse3_x86_64.asm revision 7ce0a1d1337c01056ba24006efab21f00e179e04
;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11%include "third_party/x86inc/x86inc.asm"
12
; Read-only data: eight 16-bit words, each equal to 1.  QUANTIZE_FN subtracts
; this vector from the zbin vector before its greater-than compare.
SECTION_RODATA
pw_1: dw 1, 1, 1, 1, 1, 1, 1, 1

SECTION .text
17
; TODO(yunqingwang): fix quantize_b code for skip=1 case.
;------------------------------------------------------------------------------
; QUANTIZE_FN name_suffix, num_gprs
;
; Expands to the function quantize_<name_suffix>, declared through x86inc's
; cglobal with no auto-loaded arguments, <num_gprs> general-purpose registers
; and 15 vector registers.  Named arguments, in declaration order:
;
;   coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff, dequant,
;   eob, scan, iscan
;
; How the code uses them:
;   coeff    input array of 16-bit coefficients (ncoeff entries)
;   ncoeff   number of coefficients; every pass handles two vectors
;            (offsets +0 and +16 bytes) with do-while style loops, so it is
;            presumably a positive multiple of 16 -- TODO confirm with callers
;   skip     read as a dword; non-zero selects the zero-fill path (.blank)
;   zbin, round, quant, shift, dequant
;            16-byte tables of 16-bit words.  The first 8 coefficients use the
;            8 words as loaded; afterwards the high 4 words are duplicated
;            across the register (punpckhqdq) and used for everything else.
;   qcoeff   output: quantized coefficients (16-bit)
;   dqcoeff  output: dequantized coefficients (16-bit)
;   eob      output pointer; a single 16-bit value is stored through it
;   scan     declared but never referenced in this macro
;   iscan    array of 16-bit words read alongside coeff for the eob value
;
; All vector loads/stores use mova, so the arrays and tables are presumably
; required to be 16-byte aligned -- TODO confirm with callers.
;
; Per coefficient c (all lanes are 16-bit):
;   a    = |c|
;   keep = a > zbin - 1                       (i.e. a >= zbin)
;   t    = saturating_add(a, round)
;   q    = (((t * quant) >> 16) + t) * shift >> 16     (signed high multiplies)
;   qcoeff  = keep ? sign(c) * q : 0
;   dqcoeff = low 16 bits of qcoeff * dequant
; For name_suffix == b_32x32: zbin and round become (x + 1) >> 1, shift is
; doubled, and dqcoeff = sign(c) * ((|qcoeff| * dequant) >> 1) using 16-bit
; logical shifts.  That variant also short-circuits any loop pass in which no
; lane of either vector passes the zbin test, storing zeros directly.
;
; eob = signed max over all lanes with non-zero dqcoeff of (iscan[i] - keep),
; which is iscan[i] + 1 where the lane passed the zbin test; 0 if no lane
; qualifies.
;
; Register roles in the main path:
;   m0 = zbin - 1, m1 = round, m2 = quant, m3 = dequant, m4 = shift,
;   m5 = zero, m8 = running eob maximum.
;------------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank                                               ; skip != 0: zero-fill outputs only

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp            ; r2 is scratch here; skip is no longer needed
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  mova                            m0, [zbinq]              ; m0 = zbin
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
%ifidn %1, b_32x32
  pcmpeqw                         m5, m5                   ; m5 = all ones
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  psubw                           m0, [pw_1]               ; m0 = zbin - 1, so "> m0" means ">= zbin"
  mov                             r2, shiftmp
  mov                             r3, qcoeffmp
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, b_32x32
  psllw                           m4, 1                    ; 32x32 uses twice the shift multiplier
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  ; Rename the registers to match what was just loaded into r3/r4/r5.  eob
  ; stays in argument slot 10, so eobmp still names the caller's eob argument.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
  ; Point every array at its end and count ncoeffq up from -ncoeff to 0.
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  ; The first vector uses the tables as loaded; each table then has its high
  ; half duplicated (punpckhqdq) for the second vector and all later passes.
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin
  punpckhqdq                      m0, m0
  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin
  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq                      m4, m4
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                            m8, m7                   ; zero lanes below zbin
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                           m8, m8                   ; work on magnitudes so the shift below is well defined
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                           m8, 1                    ; dqc[i] /= 2
  psrlw                          m13, 1
  psignw                          m8, m9                   ; restore the sign of c[i]
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1 (mask lanes are -1)
  psubw                          m11, m12                  ; m11 = scan[i] + 1
  pandn                           m8, m6                   ; m8 = max(eob): keep only non-zero lanes
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13                  ; m8 = running per-lane eob maximum
  add                        ncoeffq, mmsize
  jz .accumulate_eob                                       ; only one pass worth of coefficients

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin
%ifidn %1, b_32x32
  ; If no lane in either vector reaches zbin, everything quantizes to zero.
  pmovmskb                       r6d, m7
  pmovmskb                       r2d, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                           m14, m7                   ; zero lanes below zbin
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                          m14, 1                    ; dqc[i] /= 2
  psrlw                          m13, 1
  psignw                         m14, m9                   ; restore the sign of c[i]
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m12                  ; m11 = scan[i] + 1
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  ; All 16 coefficients are below zbin: write zeros, leave the eob max alone.
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe              ; fold high qword onto low qword
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe              ; fold words 2,3 onto words 0,1
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1              ; fold word 1 onto word 0
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0
  ; Store exactly 16 bits: the .blank path writes eob as a word, so a
  ; full-register store here would overwrite bytes beyond the eob slot.
  mov                           [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  neg                        ncoeffq                       ; count up from -ncoeff to 0
  pxor                            m7, m7
.blank_loop:
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro
213
; Select the XMM/SSSE3 instruction set, then expand QUANTIZE_FN twice: once
; for quantize_b and once for quantize_b_32x32, each requesting 7
; general-purpose registers (cglobal determines the final symbol names).
INIT_XMM ssse3
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7
217