1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%define private_prefix vp9
12
13%include "third_party/x86inc/x86inc.asm"
14
SECTION_RODATA
; pw_1: eight 16-bit words, each holding the value 1.
; NOTE(review): nothing visible in this file references pw_1; QUANTIZE_FP
; builds its vector of ones in-register (pcmpeqw + psrlw) instead.
pw_1:
  times 8 dw 1
17
18SECTION .text
19
;------------------------------------------------------------------------------
; QUANTIZE_FP suffix, gpr_count
;
; Declares (via cglobal) the function quantize_<suffix>.
;   %1  name suffix. When it is exactly fp_32x32 the extra %ifidn sections
;       are assembled: round = (round + 1) >> 1, quant <<= 1, and the
;       dequantized magnitude is shifted right by one.
;   %2  number of general-purpose registers requested from cglobal.
;
; Named arguments, in order:
;   coeff, ncoeff, skip, zbin, round, quant, shift,
;   qcoeff, dqcoeff, dequant, eob, scan, iscan
; zbin is loaded but never read; shift and scan are not referenced at all.
;
; Per 16-bit coefficient c (when skip == 0):
;   q  = sign(c) * (((|c| +sat round) * quant) >> 16)      -> qcoeff
;   dq = low 16 bits of q * dequant                        -> dqcoeff
;        (fp_32x32: sign(c) * ((|q| * dequant) >> 1), 16-bit lanes)
;   eob = max(iscan[i] + 1) over lanes whose dq is non-zero (signed max),
;         stored as one 16-bit word through the eob pointer.
; After the first 16 coefficients, a group of 16 whose magnitudes are all
; <= (dequant >> 1) (fp_32x32: dequant >> 2) is written as zeros without
; running the multiply path.
; When skip != 0 both output buffers are zero-filled and eob is set to 0.
;
; Register roles in the quantize path:
;   r0 = coeff end, r1 = negative element index counting up to 0,
;   r2 = dequant ptr / scratch mask / eob ptr, r3 = qcoeff end,
;   r4 = dqcoeff end, r5 = iscan end, r6 = scratch mask / final eob
;   m0 = skip threshold, m1 = round, m2 = quant, m3 = dequant,
;   m5 = zero, m7 = all-ones (also compare scratch), m8 = running eob max
;
; Assumptions (TODO confirm against callers): ncoeff is a positive multiple
; of 16 -- the first 16 coefficients are processed unconditionally and the
; loops step by 16 -- and every buffer is 16-byte aligned, since all vector
; loads/stores use mova. Uses m8-m14, so this is x86-64 only.
;------------------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank                                               ; skip != 0: just zero the outputs

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp            ; r2 (skip) is free once tested
  movifnidn                    zbinq, zbinmp               ; zbin is never read afterwards
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
%ifidn %1, fp_32x32
  pcmpeqw                         m5, m5                   ; m5 = all ones
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m1, m5
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  ; round/quant have been read into m1/m2, so r3-r5 can be reused for the
  ; output and iscan pointers.
  mov                             r3, qcoeffmp
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, fp_32x32
  psllw                           m2, 1                    ; m2 = quant * 2
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero

  ; Point every buffer at its end and negate the count, so one register
  ; indexes all four buffers and the loop ends when it reaches zero.
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                            r5q, [  r5q+ncoeffq*2]
  lea                            r3q, [ r3q+ncoeffq*2]
  lea                            r4q, [r4q+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpeqw                         m7, m7                   ; m7 = all ones (-1 per word)

  ; The first vector uses round/quant/dequant exactly as loaded; before the
  ; second vector each table's upper four words are copied over the lower
  ; four, and that broadcast form is what the AC-only loop keeps using.
  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  mova            [r3q+ncoeffq*2+ 0], m8                   ; qcoeff[i]
  mova            [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw                           m8, m8                   ; work on magnitudes so the
  pabsw                          m13, m13                  ; logical shift below is valid
%endif
  pmullw                          m8, m3                   ; r4[i] = r3[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
%ifidn %1, fp_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9                   ; put the sign of c[i] back
  psignw                         m13, m10
  psrlw                           m0, m3, 2                ; m0 = skip threshold = dequant >> 2
%else
  psrlw                           m0, m3, 1                ; m0 = skip threshold = dequant >> 1
%endif
  mova            [r4q+ncoeffq*2+ 0], m8                   ; dqcoeff[i]
  mova            [r4q+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = (dqcoeff[i] == 0)
  pcmpeqw                        m13, m5                   ; m13 = (dqcoeff[i] == 0)
  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1
  psubw                          m11, m7                   ; m11 = iscan[i] + 1
  pandn                           m8, m6                   ; keep iscan+1 where dqcoeff != 0
  pandn                          m13, m11
  pmaxsw                          m8, m13                  ; m8 = running per-lane eob max
  add                        ncoeffq, mmsize               ; 16 coefficients consumed
  jz .accumulate_eob

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)

  ; If no magnitude in either vector exceeds the threshold, take the
  ; zero-fill shortcut for this group of 16.
  pcmpgtw                         m7, m6,  m0
  pcmpgtw                        m12, m11, m0
  pmovmskb                       r6d, m7
  pmovmskb                       r2d, m12

  or                              r6, r2
  jz .skip_iter

  pcmpeqw                         m7, m7                   ; restore all ones (m7 was compare scratch)

  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  mova            [r3q+ncoeffq*2+ 0], m14                  ; qcoeff[i]
  mova            [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; r4[i] = r3[i] * q
  pmullw                         m13, m3                   ; r4[i] = r3[i] * q
%ifidn %1, fp_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
  psignw                         m13, m10
%endif
  mova            [r4q+ncoeffq*2+ 0], m14                  ; dqcoeff[i]
  mova            [r4q+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = (dqcoeff[i] == 0)
  pcmpeqw                        m13, m5                   ; m13 = (dqcoeff[i] == 0)
  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1
  psubw                          m11, m7                   ; m11 = iscan[i] + 1
  pandn                          m14, m6                   ; keep iscan+1 where dqcoeff != 0
  pandn                          m13, m11
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

  jmp .accumulate_eob
.skip_iter:
  ; Whole group is at or below the threshold: emit zeros, leave eob untouched.
  mova            [r3q+ncoeffq*2+ 0], m5
  mova            [r3q+ncoeffq*2+16], m5
  mova            [r4q+ncoeffq*2+ 0], m5
  mova            [r4q+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe              ; fold upper 4 words onto lower 4
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe              ; fold words 2-3 onto words 0-1
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1              ; fold word 1 onto word 0
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0
  ; Store exactly one 16-bit word, matching the `mov word [r3q], 0` in the
  ; .blank path. A full-register store here would write 8 bytes and clobber
  ; the 6 bytes that follow the eob slot.
  mov                           [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp

  ; Same end-pointer / negative-index scheme as the quantize path.
  lea                            r0q, [r0q+ncoeffq*2]
  lea                            r2q, [r2q+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova            [r0q+ncoeffq*2+ 0], m7                   ; dqcoeff[i .. i+15] = 0
  mova            [r0q+ncoeffq*2+16], m7
  mova            [r2q+ncoeffq*2+ 0], m7                   ; qcoeff[i .. i+15] = 0
  mova            [r2q+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                     word [r3q], 0                    ; eob = 0
  RET
%endmacro
198
; Expand QUANTIZE_FP twice under the SSSE3 XMM configuration: once for the
; plain "fp" variant and once for "fp_32x32", which enables the %ifidn
; sections inside the macro. Both request 7 general-purpose registers.
INIT_XMM ssse3
QUANTIZE_FP fp,       7
QUANTIZE_FP fp_32x32, 7
202