;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FN name_suffix, num_gprs
;
;   %1  suffix of the emitted function (quantize_%1); the value b_32x32
;       selects the 32x32 variant of the arithmetic.
;   %2  general-purpose register count handed to cglobal.
;
; Arguments, in cglobal order:
;   coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff,
;   dequant, eob, scan, iscan
;
; Behaviour as implemented below:
;   * skip != 0: qcoeff and dqcoeff are zero-filled and a 16-bit zero is
;     written through eob (.blank path).
;   * otherwise, for every group of 16 coefficients (two XMM vectors):
;       mask = abs(c) > zbin - 1
;       qc   = sign(c) * ((((abs(c) + round) * quant >> 16)
;                          + abs(c) + round) * shift >> 16)      & mask
;       dqc  = qc * dequant
;     and the running maximum of (iscan[i] + 1) over lanes whose
;     dequantized value is non-zero is written through eob as 16 bits.
;   * b_32x32 only: zbin and round are replaced by (x + 1) >> 1, shift is
;     doubled, dqc becomes sign(c) * ((abs(qc) * dequant) >> 1), and a
;     16-coefficient group in which no lane passes the zbin test is
;     zero-filled without running the multiply chain (.skip_iter).
;
; The first group uses the parameter vectors as loaded for its first 8
; lanes; after that their high qword is duplicated into both halves
; (punpckhqdq) and used for all remaining lanes.
;
; With CONFIG_VP9_HIGHBITDEPTH the coeff/qcoeff/dqcoeff elements are 32 bits
; wide in memory: inputs are packed to 16 bits with signed saturation and
; results are sign-extended back to 32 bits on store.
;
; ncoeff is negated and used as an index that counts up towards zero, so all
; array pointers are first advanced to the end of their arrays.
;-----------------------------------------------------------------------------
; TODO(yunqingwang) fix quantize_b code for skip=1 case.
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp         dword skipm, 0
  jne         .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp             ; skip is no longer needed; reuse r2
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  mova        m0, [zbinq]               ; m0 = zbin
  mova        m1, [roundq]              ; m1 = round
  mova        m2, [quantq]              ; m2 = quant
%ifidn %1, b_32x32
  pcmpeqw     m5, m5                    ; m5 = all ones
  psrlw       m5, 15                    ; m5 = 1 in every word
  paddw       m0, m5
  paddw       m1, m5
  psrlw       m0, 1                     ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                     ; m1 = (m1 + 1) / 2
%endif
  mova        m3, [r2q]                 ; m3 = dequant
  psubw       m0, [pw_1]                ; zbin - 1, so that pcmpgtw acts as >=
  mov         r2, shiftmp
  mov         r3, qcoeffmp
  mova        m4, [r2]                  ; m4 = shift
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, b_32x32
  psllw       m4, 1
%endif
  pxor        m5, m5                    ; m5 = dedicated zero
  ; Rename registers to their new roles. d1..d5 are placeholders; d2..d5 keep
  ; eob at argument index 10 so that eobmp still names the original argument.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
%if CONFIG_VP9_HIGHBITDEPTH
  lea         coeffq, [coeffq+ncoeffq*4]
  lea         qcoeffq, [qcoeffq+ncoeffq*4]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*4]
%else
  lea         coeffq, [coeffq+ncoeffq*2]
  lea         qcoeffq, [qcoeffq+ncoeffq*2]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
%endif
  lea         iscanq, [iscanq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
%if CONFIG_VP9_HIGHBITDEPTH
  ; coeff stored as 32bit numbers & require 16bit numbers
  mova        m9, [coeffq+ncoeffq*4+ 0]
  packssdw    m9, [coeffq+ncoeffq*4+16]
  mova        m10, [coeffq+ncoeffq*4+32]
  packssdw    m10, [coeffq+ncoeffq*4+48]
%else
  mova        m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova        m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
%endif
  pabsw       m6, m9                    ; m6 = abs(m9)
  pabsw       m11, m10                  ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                ; m7 = abs(c[i]) >= zbin
  punpckhqdq  m0, m0                    ; high-qword zbin for all later lanes
  pcmpgtw     m12, m11, m0              ; m12 = abs(c[i]) >= zbin
  paddsw      m6, m1                    ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                   ; m11 += round
  pmulhw      m8, m6, m2                ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2              ; m13 = m11*q>>16
  paddw       m8, m6                    ; m8 += m6
  paddw       m13, m11                  ; m13 += m11
  pmulhw      m8, m4                    ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4
  pmulhw      m13, m4                   ; m13 = m13*qsh>>16
  psignw      m8, m9                    ; m8 = reinsert sign
  psignw      m13, m10                  ; m13 = reinsert sign
  pand        m8, m7                    ; zero lanes that failed the zbin test
  pand        m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  mova        m11, m8
  mova        m6, m8
  pcmpgtw     m5, m8                    ; m5 = sign mask (0 > x)
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                    ; reset m5 to zero register
%else
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%endif
%ifidn %1, b_32x32
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                    ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m8, 1
  psrlw       m13, 1
  psignw      m8, m9
  psignw      m13, m10
%endif
%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  mova        m11, m8
  mova        m6, m8
  pcmpgtw     m5, m8
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                    ; reset m5 to zero register
%else
  mova        [dqcoeffq+ncoeffq*2+ 0], m8
  mova        [dqcoeffq+ncoeffq*2+16], m13
%endif
  pcmpeqw     m8, m5                    ; m8 = dqc[i] == 0
  pcmpeqw     m13, m5                   ; m13 = dqc[i] == 0
  mova        m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova        m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw       m6, m7                    ; m6 = scan[i] + 1 (mask lanes are -1)
  psubw       m11, m12                  ; m11 = scan[i] + 1
  pandn       m8, m6                    ; m8 = max(eob)
  pandn       m13, m11                  ; m13 = max(eob)
  pmaxsw      m8, m13                   ; m8 carries the running eob maximum
  add         ncoeffq, mmsize
  jz          .accumulate_eob

.ac_only_loop:
%if CONFIG_VP9_HIGHBITDEPTH
  ; pack coeff from 32bit to 16bit array
  mova        m9, [coeffq+ncoeffq*4+ 0]
  packssdw    m9, [coeffq+ncoeffq*4+16]
  mova        m10, [coeffq+ncoeffq*4+32]
  packssdw    m10, [coeffq+ncoeffq*4+48]
%else
  mova        m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova        m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
%endif
  pabsw       m6, m9                    ; m6 = abs(m9)
  pabsw       m11, m10                  ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                ; m7 = abs(c[i]) >= zbin
  pcmpgtw     m12, m11, m0              ; m12 = abs(c[i]) >= zbin
%ifidn %1, b_32x32
  ; no lane passes the zbin test: just zero-fill this group
  pmovmskb    r6d, m7
  pmovmskb    r2d, m12
  or          r6, r2
  jz          .skip_iter
%endif
  paddsw      m6, m1                    ; m6 += round
  paddsw      m11, m1                   ; m11 += round
  pmulhw      m14, m6, m2               ; m14 = m6*q>>16
  pmulhw      m13, m11, m2              ; m13 = m11*q>>16
  paddw       m14, m6                   ; m14 += m6
  paddw       m13, m11                  ; m13 += m11
  pmulhw      m14, m4                   ; m14 = m14*qsh>>16
  pmulhw      m13, m4                   ; m13 = m13*qsh>>16
  psignw      m14, m9                   ; m14 = reinsert sign
  psignw      m13, m10                  ; m13 = reinsert sign
  pand        m14, m7
  pand        m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  mova        m11, m14
  mova        m6, m14
  pcmpgtw     m5, m14
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                    ; reset m5 to zero register
%else
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%endif
%ifidn %1, b_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                   ; dqc[i] = qc[i] * q
  pmullw      m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif
%if CONFIG_VP9_HIGHBITDEPTH
  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  mova        m11, m14
  mova        m6, m14
  pcmpgtw     m5, m14
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                    ; reset m5 to zero register
%else
  mova        [dqcoeffq+ncoeffq*2+ 0], m14
  mova        [dqcoeffq+ncoeffq*2+16], m13
%endif
  pcmpeqw     m14, m5                   ; m14 = dqc[i] == 0
  pcmpeqw     m13, m5                   ; m13 = dqc[i] == 0
  mova        m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova        m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw       m6, m7                    ; m6 = scan[i] + 1
  psubw       m11, m12                  ; m11 = scan[i] + 1
  pandn       m14, m6                   ; m14 = max(eob)
  pandn       m13, m11                  ; m13 = max(eob)
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl          .ac_only_loop

%ifidn %1, b_32x32
  jmp         .accumulate_eob
.skip_iter:
%if CONFIG_VP9_HIGHBITDEPTH
  mova        [qcoeffq+ncoeffq*4+ 0], m5
  mova        [qcoeffq+ncoeffq*4+16], m5
  mova        [qcoeffq+ncoeffq*4+32], m5
  mova        [qcoeffq+ncoeffq*4+48], m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m5
  mova        [dqcoeffq+ncoeffq*4+16], m5
  mova        [dqcoeffq+ncoeffq*4+32], m5
  mova        [dqcoeffq+ncoeffq*4+48], m5
%else
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova        [dqcoeffq+ncoeffq*2+ 0], m5
  mova        [dqcoeffq+ncoeffq*2+16], m5
%endif
  add         ncoeffq, mmsize
  jl          .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  pextrw      r6, m8, 0
  ; eob is a 16-bit location (the .blank path stores it with "mov word"), so
  ; store only the low word; a full-register store would write past it.
  mov         [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov         r0, dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, qcoeffmp
  mov         r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
%if CONFIG_VP9_HIGHBITDEPTH
  lea         dqcoeffq, [dqcoeffq+ncoeffq*4]
  lea         qcoeffq, [qcoeffq+ncoeffq*4]
%else
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea         qcoeffq, [qcoeffq+ncoeffq*2]
%endif
  neg         ncoeffq
  pxor        m7, m7
.blank_loop:
%if CONFIG_VP9_HIGHBITDEPTH
  mova        [dqcoeffq+ncoeffq*4+ 0], m7
  mova        [dqcoeffq+ncoeffq*4+16], m7
  mova        [dqcoeffq+ncoeffq*4+32], m7
  mova        [dqcoeffq+ncoeffq*4+48], m7
  mova        [qcoeffq+ncoeffq*4+ 0], m7
  mova        [qcoeffq+ncoeffq*4+16], m7
  mova        [qcoeffq+ncoeffq*4+32], m7
  mova        [qcoeffq+ncoeffq*4+48], m7
%else
  mova        [dqcoeffq+ncoeffq*2+ 0], m7
  mova        [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
%endif
  add         ncoeffq, mmsize
  jl          .blank_loop
  mov         word [eobq], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FN b, 7
QUANTIZE_FN b_32x32, 7