;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FN name, num_gprs
;
; Emits the function quantize_<name> through x86inc's cglobal, using
; <num_gprs> general-purpose registers and xmm0-xmm14 (so x86-64 only).
;
; Arguments, in declaration order:
;   coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff,
;   dequant, zbin_oq, eob, scan, iscan
; `scan` is declared but never read by this code; only `iscan` is used.
;
; Behaviour as implemented below (all coefficient data is handled as 16-bit
; words, 16 words per iteration):
;   * skip != 0: qcoeff[] and dqcoeff[] are zero-filled and *eob = 0.
;   * skip == 0: for every coefficient c
;        a   = |c|
;        sel = a > zbin + zbin_oq - 1           (i.e. a >= zbin + zbin_oq)
;        t   = sat16(a + round)
;        q   = ((((t * quant) >> 16) + t) * shift) >> 16
;        qcoeff  = sel ? sign(c) * q : 0
;        dqcoeff = qcoeff * dequant
;     and *eob = max(iscan[i] + 1) over the positions where sel holds and
;     dqcoeff is non-zero (0 if there are none).
;     The first 8 coefficients use words 0..7 of zbin/round/quant/shift/
;     dequant; every later coefficient uses words 4..7 of those tables.
;   * name == b_32x32 additionally replaces zbin+zbin_oq and round by
;     (x + 1) >> 1, doubles shift, and stores
;     dqcoeff = sign(c) * ((|qcoeff| * dequant) >> 1).
;
; Every vector load/store uses mova, so the tables and the coefficient
; buffers are accessed as aligned 16-byte vectors.
; NOTE(review): the first 16 coefficients are processed unconditionally and
; the counter advances by 16, so ncoeff is presumably a positive multiple of
; 16 -- confirm against the callers.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp            ; skip is dead; reuse r2
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
  mova                            m0, [zbinq]              ; m0 = zbin
  punpcklwd                       m4, m4
  mova                            m1, [roundq]             ; m1 = round
  pshufd                          m4, m4, 0                ; broadcast zbin_oq to all words
  mova                            m2, [quantq]             ; m2 = quant
  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
  pcmpeqw                         m5, m5
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  psubw                           m0, [pw_1]               ; so that "> m0" means ">= zbin"
  mov                             r2, shiftmp
  mov                             r3, qcoeffmp
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, b_32x32
  psllw                           m4, 1
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  ; d1..d6 are placeholders: they keep `eob` at argument index 11 so that
  ; eobmp below still refers to the original eob argument.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; point every buffer at its end and count ncoeff up from -n to 0
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq                      m0, m0                   ; keep only words 4..7 from now on
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq                      m4, m4
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                            m8, m7                   ; zero lanes below zbin
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * dequant
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * dequant
%ifidn %1, b_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1 (where >= zbin)
  psubw                          m11, m12                  ; m11 = scan[i] + 1 (where >= zbin)
  pandn                           m8, m6                   ; m8 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jz .accumulate_eob

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; nothing in these 16 coefficients reaches zbin: just store zeroes
  pmovmskb                        r6, m7
  pmovmskb                        r2, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                           m14, m7
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * dequant
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * dequant
%ifidn %1, b_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1 (where >= zbin)
  psubw                          m11, m12                  ; m11 = scan[i] + 1 (where >= zbin)
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  ; The memory-destination form of pextrw is an SSE4.1 encoding and must not
  ; be used in an SSSE3 function. Extract into a register with the SSE2 form
  ; instead; r0 (coeff pointer) is no longer needed at this point.
  pextrw                         r0d, m8, 0
  mov                           [r2], r0w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FN b, 6
QUANTIZE_FN b_32x32, 7