;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FN name_suffix, num_gprs
;
;   %1  function name suffix; the emitted symbol is quantize_%1.
;       The suffix b_32x32 additionally enables the 32x32-specific paths
;       guarded by "%ifidn %1, b_32x32" below.
;   %2  register count handed to cglobal (x86inc convention).
;
; Named arguments, in order:
;   coeff, ncoeff, skip, zbin, round, quant, shift,
;   qcoeff, dqcoeff, dequant, eob, scan, iscan
; cglobal is invoked with 0 auto-loaded arguments, so every argument is
; fetched explicitly through its <name>mp / movifnidn accessor.
;
; Behaviour visible in this macro:
;   * skip != 0: qcoeff and dqcoeff are zero-filled and a 16-bit 0 is stored
;     through eob (.blank).
;   * otherwise: coefficients are processed 16 at a time (two 8 x int16
;     vectors per iteration). The first iteration uses the zbin/round/quant/
;     shift/dequant vectors as loaded; after each one is used for the first
;     vector its high quadword is broadcast and used for everything else.
;   * eob receives, as a 16-bit value, the maximum of (iscan[i] + 1) over
;     lanes that passed the zbin test and whose dequantized value is
;     non-zero, or 0 if there is no such lane.
;   * b_32x32 only: zbin and round are replaced by (x + 1) >> 1, shift is
;     doubled, dqcoeff is (|qcoeff| * dequant) >> 1 with the sign of the
;     input coefficient, and 16-coefficient groups in which nothing reaches
;     zbin are zero-filled without running the multiply chain.
;
; Register roles in the main path:
;   m0 = zbin - 1      m1 = round     m2 = quant     m3 = dequant
;   m4 = shift         m5 = zero      m8 = running per-lane eob maximum
;
; All vector accesses use mova, i.e. aligned 16-byte loads/stores.
; NOTE(review): scan is declared but never read in this macro.
;-----------------------------------------------------------------------------
; TODO(yunqingwang): fix quantize_b code for skip=1 case.
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  mova                            m0, [zbinq]              ; m0 = zbin
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
%ifidn %1, b_32x32
  pcmpeqw                         m5, m5
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  psubw                           m0, [pw_1]               ; zbin - 1, so that the signed
                                                           ; "greater than" compare below
                                                           ; acts as "abs(c) >= zbin"
  mov                             r2, shiftmp
  mov                             r3, qcoeffmp
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, b_32x32
  psllw                           m4, 1
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
  ; Point every array at its end and count ncoeff up from -n to 0, so a
  ; single register serves as both index and loop counter.
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq                      m0, m0                   ; broadcast high half of zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1                   ; broadcast high half of round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2                   ; broadcast high half of quant
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq                      m4, m4                   ; broadcast high half of shift
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                            m8, m7                   ; zero lanes below zbin
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3                   ; broadcast high half of dequant
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where >= zbin
  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where >= zbin
  pandn                           m8, m6                   ; m8 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jz .accumulate_eob

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; Nothing in these 16 coefficients reaches zbin: take the zero-fill path.
  pmovmskb                       r6d, m7
  pmovmskb                       r2d, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                           m14, m7                   ; zero lanes below zbin
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where >= zbin
  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where >= zbin
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  ; All 16 coefficients quantize to zero; the eob maximum in m8 is unchanged.
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0
  ; Store exactly 16 bits, matching the "mov word [eobq], 0" in .blank.
  ; A full-width r6 store would write 8 bytes through the eob pointer.
  mov                           [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7