;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
; NOTE(review): pw_1 is not referenced by any code visible in this file.
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FP name, num_gprs
;
;   %1 = function name suffix ("fp" or "fp_32x32"); the "fp_32x32" spelling
;        also enables the %ifidn-guarded 32x32 adjustments below.
;   %2 = number of general-purpose registers requested from cglobal.
;
; Emits quantize_%1 (x86inc adds the private_prefix / cpu decoration;
; presumably vp9_quantize_<name>_ssse3 -- confirm against the C prototypes).
;
; Arguments, in cglobal order:
;   coeff, ncoeff, skip, zbin, round, quant, shift,
;   qcoeff, dqcoeff, dequant, eob, scan, iscan
; zbin is fetched but never read; shift and scan are never referenced.
;
; Behaviour as implemented here:
;   * skip != 0 : qcoeff[] and dqcoeff[] are zero-filled and *eob = 0.
;   * skip == 0 : per 16-bit lane
;                   q       = sign(c) * (((|c| + round) * quant) >> 16)
;                   dqcoeff = q * dequant            (32x32: see inline notes)
;                 and *eob = max(iscan[i] + 1) over lanes with dqcoeff != 0.
;
; Assumptions visible from the code:
;   * Coefficients, round/quant/dequant tables and iscan are 16-bit words
;     (word SIMD ops, index scaled by 2).
;   * Each iteration consumes 16 coefficients (two XMM vectors), so ncoeff is
;     assumed to be a non-zero multiple of 16.
;   * All vector accesses use mova, so buffers are assumed 16-byte aligned.
;   * eob points at a 16-bit value (both exit paths store one word).
;   * Uses m8-m14, i.e. x86-64 only.
;
; Register roles on the quantize path:
;   r3 = end of qcoeff, r4 = end of dqcoeff, r5 = end of iscan,
;   coeffq = end of coeff, ncoeffq = negative element index counting up to 0
;   m0 = skip threshold, m1 = round, m2 = quant, m3 = dequant,
;   m5 = zero, m7 = all-ones (-1 per word), m8 = running eob maximum
;-----------------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp         dword skipm, 0
  jne         .blank

  ; ---- setup: fetch pointers and the round/quant/dequant vectors ----------
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp               ; r2 (skip's slot) is free from here on
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  mova        m1, [roundq]                ; m1 = round
  mova        m2, [quantq]                ; m2 = quant
%ifidn %1, fp_32x32
  pcmpeqw     m5, m5
  psrlw       m5, 15                      ; m5 = 1 in every word
  paddw       m1, m5
  psrlw       m1, 1                       ; m1 = (round + 1) / 2
%endif
  mova        m3, [r2q]                   ; m3 = dequant
  ; round/quant have been consumed, so their registers are reused as pointers.
  mov         r3, qcoeffmp
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, fp_32x32
  psllw       m2, 1                       ; m2 = quant * 2
%endif
  pxor        m5, m5                      ; m5 = dedicated zero

  ; Point every array at its end and walk a negative index up to zero.
  lea         coeffq, [coeffq+ncoeffq*2]
  lea         r5q, [r5q+ncoeffq*2]
  lea         r3q, [r3q+ncoeffq*2]
  lea         r4q, [r4q+ncoeffq*2]
  neg         ncoeffq

  ; ---- first 16 coefficients: DC plus the first 15 AC ---------------------
  mova        m9, [coeffq+ncoeffq*2+ 0]   ; m9  = c[i]
  mova        m10, [coeffq+ncoeffq*2+16]  ; m10 = c[i + 8]
  pabsw       m6, m9                      ; m6  = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpeqw     m7, m7                      ; m7  = -1 per word

  ; After the first vector, punpckhqdq copies the upper four words of each
  ; table over the lower four, so the word-0 (DC) entry is no longer used.
  paddsw      m6, m1                      ; m6  += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                     ; m11 += round
  pmulhw      m8, m6, m2                  ; m8  = (m6 * quant) >> 16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2                ; m13 = (m11 * quant) >> 16
  psignw      m8, m9                      ; reapply sign of c[i]
  psignw      m13, m10
  mova        [r3q+ncoeffq*2+ 0], m8      ; store qcoeff
  mova        [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw       m8, m8                      ; work on magnitudes for the halving
  pabsw       m13, m13
%endif
  pmullw      m8, m3                      ; dqcoeff = qcoeff * dequant (low 16 bits)
  punpckhqdq  m3, m3
  pmullw      m13, m3
%ifidn %1, fp_32x32
  psrlw       m8, 1                       ; dqcoeff magnitude / 2
  psrlw       m13, 1
  psignw      m8, m9                      ; reapply sign of c[i]
  psignw      m13, m10
  psrlw       m0, m3, 2                   ; m0 = dequant >> 2 (skip threshold)
%else
  psrlw       m0, m3, 1                   ; m0 = dequant >> 1 (skip threshold)
%endif
  mova        [r4q+ncoeffq*2+ 0], m8      ; store dqcoeff
  mova        [r4q+ncoeffq*2+16], m13
  pcmpeqw     m8, m5                      ; m8  = (dqcoeff == 0) mask
  pcmpeqw     m13, m5                     ; m13 = (dqcoeff == 0) mask
  mova        m6, [r5q+ncoeffq*2+ 0]      ; m6  = iscan[i]
  mova        m11, [r5q+ncoeffq*2+16]     ; m11 = iscan[i + 8]
  psubw       m6, m7                      ; m6  = iscan[i] + 1
  psubw       m11, m7                     ; m11 = iscan[i + 8] + 1
  pandn       m8, m6                      ; keep iscan + 1 only where dqcoeff != 0
  pandn       m13, m11
  pmaxsw      m8, m13                     ; m8 = running eob candidates
  add         ncoeffq, mmsize             ; advance 16 coefficients
  jz          .accumulate_eob

  ; ---- remaining coefficients ---------------------------------------------
.ac_only_loop:
  mova        m9, [coeffq+ncoeffq*2+ 0]   ; m9  = c[i]
  mova        m10, [coeffq+ncoeffq*2+16]  ; m10 = c[i + 8]
  pabsw       m6, m9                      ; m6  = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)

  ; If no magnitude in this group exceeds the threshold, skip the arithmetic
  ; and just write zeros.
  pcmpgtw     m7, m6, m0
  pcmpgtw     m12, m11, m0
  pmovmskb    r6d, m7
  pmovmskb    r2d, m12                    ; r2 is scratch here; reloaded at exit

  or          r6, r2
  jz          .skip_iter

  pcmpeqw     m7, m7                      ; restore m7 = -1 per word

  paddsw      m6, m1                      ; m6  += round
  paddsw      m11, m1                     ; m11 += round
  pmulhw      m14, m6, m2                 ; m14 = (m6 * quant) >> 16
  pmulhw      m13, m11, m2                ; m13 = (m11 * quant) >> 16
  psignw      m14, m9                     ; reapply sign of c[i]
  psignw      m13, m10
  mova        [r3q+ncoeffq*2+ 0], m14     ; store qcoeff
  mova        [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                     ; dqcoeff = qcoeff * dequant (low 16 bits)
  pmullw      m13, m3
%ifidn %1, fp_32x32
  psrlw       m14, 1                      ; dqcoeff magnitude / 2
  psrlw       m13, 1
  psignw      m14, m9                     ; reapply sign of c[i]
  psignw      m13, m10
%endif
  mova        [r4q+ncoeffq*2+ 0], m14     ; store dqcoeff
  mova        [r4q+ncoeffq*2+16], m13
  pcmpeqw     m14, m5                     ; m14 = (dqcoeff == 0) mask
  pcmpeqw     m13, m5                     ; m13 = (dqcoeff == 0) mask
  mova        m6, [r5q+ncoeffq*2+ 0]      ; m6  = iscan[i]
  mova        m11, [r5q+ncoeffq*2+16]     ; m11 = iscan[i + 8]
  psubw       m6, m7                      ; m6  = iscan[i] + 1
  psubw       m11, m7                     ; m11 = iscan[i + 8] + 1
  pandn       m14, m6                     ; keep iscan + 1 only where dqcoeff != 0
  pandn       m13, m11
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl          .ac_only_loop

  jmp         .accumulate_eob

.skip_iter:
  mova        [r3q+ncoeffq*2+ 0], m5      ; qcoeff  = 0
  mova        [r3q+ncoeffq*2+16], m5
  mova        [r4q+ncoeffq*2+ 0], m5      ; dqcoeff = 0
  mova        [r4q+ncoeffq*2+16], m5
  add         ncoeffq, mmsize
  jl          .ac_only_loop

.accumulate_eob:
  ; Horizontal max of the eight words in m8, then store it through eob.
  mov         r2, eobmp
  pshufd      m7, m8, 0xe                 ; upper qword -> lower qword
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe                 ; words 2,3 -> words 0,1
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1                 ; word 1 -> word 0
  pmaxsw      m8, m7
  pextrw      r6, m8, 0
  ; Store exactly one word, like the `mov word [r3q], 0` in .blank below.
  ; A full-width r6 store would write past the 16-bit eob slot.
  mov         [r2], r6w
  RET

  ; ---- skip-block: write all zeroes and report eob = 0 --------------------
.blank:
  mov         r0, dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, qcoeffmp
  mov         r3, eobmp

  lea         r0q, [r0q+ncoeffq*2]
  lea         r2q, [r2q+ncoeffq*2]
  neg         ncoeffq
  pxor        m7, m7
.blank_loop:
  mova        [r0q+ncoeffq*2+ 0], m7      ; dqcoeff = 0
  mova        [r0q+ncoeffq*2+16], m7
  mova        [r2q+ncoeffq*2+ 0], m7      ; qcoeff  = 0
  mova        [r2q+ncoeffq*2+16], m7
  add         ncoeffq, mmsize
  jl          .blank_loop
  mov         word [r3q], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FP fp, 7
QUANTIZE_FP fp_32x32, 7