12263fc984bdc858ee931d3e35c87c404de923950Johann; 22263fc984bdc858ee931d3e35c87c404de923950Johann; Copyright (c) 2015 The WebM project authors. All Rights Reserved. 32263fc984bdc858ee931d3e35c87c404de923950Johann; 42263fc984bdc858ee931d3e35c87c404de923950Johann; Use of this source code is governed by a BSD-style license 52263fc984bdc858ee931d3e35c87c404de923950Johann; that can be found in the LICENSE file in the root of the source 62263fc984bdc858ee931d3e35c87c404de923950Johann; tree. An additional intellectual property rights grant can be found 72263fc984bdc858ee931d3e35c87c404de923950Johann; in the file PATENTS. All contributing project authors may 82263fc984bdc858ee931d3e35c87c404de923950Johann; be found in the AUTHORS file in the root of the source tree. 92263fc984bdc858ee931d3e35c87c404de923950Johann; 102263fc984bdc858ee931d3e35c87c404de923950Johann 112263fc984bdc858ee931d3e35c87c404de923950Johann%include "third_party/x86inc/x86inc.asm" 122263fc984bdc858ee931d3e35c87c404de923950Johann 132263fc984bdc858ee931d3e35c87c404de923950JohannSECTION .text 142263fc984bdc858ee931d3e35c87c404de923950Johann 152263fc984bdc858ee931d3e35c87c404de923950Johann%macro QUANTIZE_FN 2 162263fc984bdc858ee931d3e35c87c404de923950Johanncglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ 172263fc984bdc858ee931d3e35c87c404de923950Johann shift, qcoeff, dqcoeff, dequant, \ 182263fc984bdc858ee931d3e35c87c404de923950Johann eob, scan, iscan 192263fc984bdc858ee931d3e35c87c404de923950Johann 202263fc984bdc858ee931d3e35c87c404de923950Johann vzeroupper 212263fc984bdc858ee931d3e35c87c404de923950Johann 222263fc984bdc858ee931d3e35c87c404de923950Johann ; If we can skip this block, then just zero the output 232263fc984bdc858ee931d3e35c87c404de923950Johann cmp skipmp, 0 242263fc984bdc858ee931d3e35c87c404de923950Johann jne .blank 252263fc984bdc858ee931d3e35c87c404de923950Johann 262263fc984bdc858ee931d3e35c87c404de923950Johann%ifnidn %1, b_32x32 
272263fc984bdc858ee931d3e35c87c404de923950Johann 282263fc984bdc858ee931d3e35c87c404de923950Johann ; Special case for ncoeff == 16, as it is frequent and we can save on 292263fc984bdc858ee931d3e35c87c404de923950Johann ; not setting up a loop. 302263fc984bdc858ee931d3e35c87c404de923950Johann cmp ncoeffmp, 16 312263fc984bdc858ee931d3e35c87c404de923950Johann jne .generic 322263fc984bdc858ee931d3e35c87c404de923950Johann 332263fc984bdc858ee931d3e35c87c404de923950Johann ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 342263fc984bdc858ee931d3e35c87c404de923950Johann ;; Special case of ncoeff == 16 352263fc984bdc858ee931d3e35c87c404de923950Johann ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 362263fc984bdc858ee931d3e35c87c404de923950Johann 372263fc984bdc858ee931d3e35c87c404de923950Johann.single: 382263fc984bdc858ee931d3e35c87c404de923950Johann 392263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn coeffq, coeffmp 402263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn zbinq, zbinmp 412263fc984bdc858ee931d3e35c87c404de923950Johann mova m0, [zbinq] ; m0 = zbin 422263fc984bdc858ee931d3e35c87c404de923950Johann 432263fc984bdc858ee931d3e35c87c404de923950Johann ; Get DC and first 15 AC coeffs - in this special case, that is all. 
442263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 452263fc984bdc858ee931d3e35c87c404de923950Johann ; coeff stored as 32bit numbers but we process them as 16 bit numbers 462263fc984bdc858ee931d3e35c87c404de923950Johann mova m9, [coeffq] 472263fc984bdc858ee931d3e35c87c404de923950Johann packssdw m9, [coeffq+16] ; m9 = c[i] 482263fc984bdc858ee931d3e35c87c404de923950Johann mova m10, [coeffq+32] 492263fc984bdc858ee931d3e35c87c404de923950Johann packssdw m10, [coeffq+48] ; m10 = c[i] 502263fc984bdc858ee931d3e35c87c404de923950Johann%else 512263fc984bdc858ee931d3e35c87c404de923950Johann mova m9, [coeffq] ; m9 = c[i] 522263fc984bdc858ee931d3e35c87c404de923950Johann mova m10, [coeffq+16] ; m10 = c[i] 532263fc984bdc858ee931d3e35c87c404de923950Johann%endif 542263fc984bdc858ee931d3e35c87c404de923950Johann 552263fc984bdc858ee931d3e35c87c404de923950Johann mov r0, eobmp ; Output pointer 562263fc984bdc858ee931d3e35c87c404de923950Johann mov r1, qcoeffmp ; Output pointer 572263fc984bdc858ee931d3e35c87c404de923950Johann mov r2, dqcoeffmp ; Output pointer 582263fc984bdc858ee931d3e35c87c404de923950Johann 592263fc984bdc858ee931d3e35c87c404de923950Johann pxor m5, m5 ; m5 = dedicated zero 602263fc984bdc858ee931d3e35c87c404de923950Johann 612263fc984bdc858ee931d3e35c87c404de923950Johann pcmpeqw m4, m4 ; All word lanes -1 622263fc984bdc858ee931d3e35c87c404de923950Johann paddw m0, m4 ; m0 = zbin - 1 632263fc984bdc858ee931d3e35c87c404de923950Johann 642263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m6, m9 ; m6 = abs(m9) 652263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m11, m10 ; m11 = abs(m10) 662263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin 672263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m0, m0 682263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin 692263fc984bdc858ee931d3e35c87c404de923950Johann 702263fc984bdc858ee931d3e35c87c404de923950Johann ; Check if all coeffs are less 
than zbin. If yes, we just write zeros 712263fc984bdc858ee931d3e35c87c404de923950Johann ; to the outputs and we are done. 722263fc984bdc858ee931d3e35c87c404de923950Johann por m14, m7, m12 732263fc984bdc858ee931d3e35c87c404de923950Johann ptest m14, m14 742263fc984bdc858ee931d3e35c87c404de923950Johann jnz .single_nonzero 752263fc984bdc858ee931d3e35c87c404de923950Johann 762263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 772263fc984bdc858ee931d3e35c87c404de923950Johann mova [r1 ], ymm5 782263fc984bdc858ee931d3e35c87c404de923950Johann mova [r1+32], ymm5 792263fc984bdc858ee931d3e35c87c404de923950Johann mova [r2 ], ymm5 802263fc984bdc858ee931d3e35c87c404de923950Johann mova [r2+32], ymm5 812263fc984bdc858ee931d3e35c87c404de923950Johann%else 822263fc984bdc858ee931d3e35c87c404de923950Johann mova [r1], ymm5 832263fc984bdc858ee931d3e35c87c404de923950Johann mova [r2], ymm5 842263fc984bdc858ee931d3e35c87c404de923950Johann%endif 852263fc984bdc858ee931d3e35c87c404de923950Johann mov [r0], word 0 862263fc984bdc858ee931d3e35c87c404de923950Johann 872263fc984bdc858ee931d3e35c87c404de923950Johann vzeroupper 882263fc984bdc858ee931d3e35c87c404de923950Johann RET 892263fc984bdc858ee931d3e35c87c404de923950Johann 902263fc984bdc858ee931d3e35c87c404de923950Johann.single_nonzero: 912263fc984bdc858ee931d3e35c87c404de923950Johann 922263fc984bdc858ee931d3e35c87c404de923950Johann ; Actual quantization of size 16 block - setup pointers, rounders, etc. 
932263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn r4, roundmp 942263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn r5, quantmp 952263fc984bdc858ee931d3e35c87c404de923950Johann mov r3, dequantmp 962263fc984bdc858ee931d3e35c87c404de923950Johann mov r6, shiftmp 972263fc984bdc858ee931d3e35c87c404de923950Johann mova m1, [r4] ; m1 = round 982263fc984bdc858ee931d3e35c87c404de923950Johann mova m2, [r5] ; m2 = quant 992263fc984bdc858ee931d3e35c87c404de923950Johann mova m3, [r3] ; m3 = dequant 1002263fc984bdc858ee931d3e35c87c404de923950Johann mova m4, [r6] ; m4 = shift 1012263fc984bdc858ee931d3e35c87c404de923950Johann 1022263fc984bdc858ee931d3e35c87c404de923950Johann mov r3, iscanmp 1032263fc984bdc858ee931d3e35c87c404de923950Johann 1042263fc984bdc858ee931d3e35c87c404de923950Johann DEFINE_ARGS eob, qcoeff, dqcoeff, iscan 1052263fc984bdc858ee931d3e35c87c404de923950Johann 1062263fc984bdc858ee931d3e35c87c404de923950Johann ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1072263fc984bdc858ee931d3e35c87c404de923950Johann 1082263fc984bdc858ee931d3e35c87c404de923950Johann paddsw m6, m1 ; m6 += round 1092263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m1, m1 1102263fc984bdc858ee931d3e35c87c404de923950Johann paddsw m11, m1 ; m11 += round 1112263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m8, m6, m2 ; m8 = m6*q>>16 1122263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m2, m2 1132263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m13, m11, m2 ; m13 = m11*q>>16 1142263fc984bdc858ee931d3e35c87c404de923950Johann paddw m8, m6 ; m8 += m6 1152263fc984bdc858ee931d3e35c87c404de923950Johann paddw m13, m11 ; m13 += m11 1162263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m8, m4 ; m8 = m8*qsh>>16 1172263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m4, m4 1182263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m13, m4 ; m13 = m13*qsh>>16 1192263fc984bdc858ee931d3e35c87c404de923950Johann psignw m8, m9 ; m8 = reinsert 
sign 1202263fc984bdc858ee931d3e35c87c404de923950Johann psignw m13, m10 ; m13 = reinsert sign 1212263fc984bdc858ee931d3e35c87c404de923950Johann pand m8, m7 1222263fc984bdc858ee931d3e35c87c404de923950Johann pand m13, m12 1232263fc984bdc858ee931d3e35c87c404de923950Johann 1242263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 1252263fc984bdc858ee931d3e35c87c404de923950Johann ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff 1262263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m8 1272263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m8, m6 1282263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m8 1292263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq ], m11 1302263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+16], m6 1312263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m13 1322263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m13, m6 1332263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m13 1342263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+32], m11 1352263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+48], m6 1362263fc984bdc858ee931d3e35c87c404de923950Johann%else 1372263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq ], m8 1382263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+16], m13 1392263fc984bdc858ee931d3e35c87c404de923950Johann%endif 1402263fc984bdc858ee931d3e35c87c404de923950Johann 1412263fc984bdc858ee931d3e35c87c404de923950Johann pmullw m8, m3 ; dqc[i] = qc[i] * q 1422263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m3, m3 1432263fc984bdc858ee931d3e35c87c404de923950Johann pmullw m13, m3 ; dqc[i] = qc[i] * q 1442263fc984bdc858ee931d3e35c87c404de923950Johann 1452263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 1462263fc984bdc858ee931d3e35c87c404de923950Johann ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff 
1472263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m8 1482263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m8, m6 1492263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m8 1502263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq ], m11 1512263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+16], m6 1522263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m13 1532263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m13, m6 1542263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m13 1552263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+32], m11 1562263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+48], m6 1572263fc984bdc858ee931d3e35c87c404de923950Johann%else 1582263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq ], m8 1592263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+16], m13 1602263fc984bdc858ee931d3e35c87c404de923950Johann%endif 1612263fc984bdc858ee931d3e35c87c404de923950Johann 1622263fc984bdc858ee931d3e35c87c404de923950Johann mova m6, [iscanq] ; m6 = scan[i] 1632263fc984bdc858ee931d3e35c87c404de923950Johann mova m11, [iscanq+16] ; m11 = scan[i] 1642263fc984bdc858ee931d3e35c87c404de923950Johann 1652263fc984bdc858ee931d3e35c87c404de923950Johann pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 1662263fc984bdc858ee931d3e35c87c404de923950Johann pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 1672263fc984bdc858ee931d3e35c87c404de923950Johann psubw m6, m6, m7 ; m6 = scan[i] + 1 1682263fc984bdc858ee931d3e35c87c404de923950Johann psubw m11, m11, m12 ; m11 = scan[i] + 1 1692263fc984bdc858ee931d3e35c87c404de923950Johann pandn m8, m8, m6 ; m8 = max(eob) 1702263fc984bdc858ee931d3e35c87c404de923950Johann pandn m13, m13, m11 ; m13 = max(eob) 1712263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m8, m13 1722263fc984bdc858ee931d3e35c87c404de923950Johann 1732263fc984bdc858ee931d3e35c87c404de923950Johann ; Horizontally accumulate/max eobs and write into [eob] memory pointer 
1742263fc984bdc858ee931d3e35c87c404de923950Johann pshufd m7, m8, 0xe 1752263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 1762263fc984bdc858ee931d3e35c87c404de923950Johann pshuflw m7, m8, 0xe 1772263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 1782263fc984bdc858ee931d3e35c87c404de923950Johann pshuflw m7, m8, 0x1 1792263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 1802263fc984bdc858ee931d3e35c87c404de923950Johann movq rax, m8 1812263fc984bdc858ee931d3e35c87c404de923950Johann mov [eobq], ax 1822263fc984bdc858ee931d3e35c87c404de923950Johann 1832263fc984bdc858ee931d3e35c87c404de923950Johann vzeroupper 1842263fc984bdc858ee931d3e35c87c404de923950Johann RET 1852263fc984bdc858ee931d3e35c87c404de923950Johann 1862263fc984bdc858ee931d3e35c87c404de923950Johann ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1872263fc984bdc858ee931d3e35c87c404de923950Johann ;; Generic case of ncoeff != 16 1882263fc984bdc858ee931d3e35c87c404de923950Johann ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1892263fc984bdc858ee931d3e35c87c404de923950Johann 1902263fc984bdc858ee931d3e35c87c404de923950Johann.generic: 1912263fc984bdc858ee931d3e35c87c404de923950Johann 1922263fc984bdc858ee931d3e35c87c404de923950Johann%endif ; %ifnidn %1, b_32x32 1932263fc984bdc858ee931d3e35c87c404de923950Johann 1942263fc984bdc858ee931d3e35c87c404de923950JohannDEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ 1952263fc984bdc858ee931d3e35c87c404de923950Johann qcoeff, dqcoeff, dequant, eob, scan, iscan 1962263fc984bdc858ee931d3e35c87c404de923950Johann 1972263fc984bdc858ee931d3e35c87c404de923950Johann ; Actual quantization loop - setup pointers, rounders, etc. 
1982263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn coeffq, coeffmp 1992263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn ncoeffq, ncoeffmp 2002263fc984bdc858ee931d3e35c87c404de923950Johann mov r2, dequantmp 2012263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn zbinq, zbinmp 2022263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn roundq, roundmp 2032263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn quantq, quantmp 2042263fc984bdc858ee931d3e35c87c404de923950Johann mova m0, [zbinq] ; m0 = zbin 2052263fc984bdc858ee931d3e35c87c404de923950Johann mova m1, [roundq] ; m1 = round 2062263fc984bdc858ee931d3e35c87c404de923950Johann mova m2, [quantq] ; m2 = quant 2072263fc984bdc858ee931d3e35c87c404de923950Johann mova m3, [r2] ; m3 = dequant 2082263fc984bdc858ee931d3e35c87c404de923950Johann pcmpeqw m4, m4 ; All lanes -1 2092263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32 2102263fc984bdc858ee931d3e35c87c404de923950Johann psubw m0, m4 2112263fc984bdc858ee931d3e35c87c404de923950Johann psubw m1, m4 2122263fc984bdc858ee931d3e35c87c404de923950Johann psrlw m0, 1 ; m0 = (m0 + 1) / 2 2132263fc984bdc858ee931d3e35c87c404de923950Johann psrlw m1, 1 ; m1 = (m1 + 1) / 2 2142263fc984bdc858ee931d3e35c87c404de923950Johann%endif 2152263fc984bdc858ee931d3e35c87c404de923950Johann paddw m0, m4 ; m0 = m0 - 1 2162263fc984bdc858ee931d3e35c87c404de923950Johann 2172263fc984bdc858ee931d3e35c87c404de923950Johann mov r2, shiftmp 2182263fc984bdc858ee931d3e35c87c404de923950Johann mov r3, qcoeffmp 2192263fc984bdc858ee931d3e35c87c404de923950Johann mova m4, [r2] ; m4 = shift 2202263fc984bdc858ee931d3e35c87c404de923950Johann mov r4, dqcoeffmp 2212263fc984bdc858ee931d3e35c87c404de923950Johann mov r5, iscanmp 2222263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32 2232263fc984bdc858ee931d3e35c87c404de923950Johann psllw m4, 1 2242263fc984bdc858ee931d3e35c87c404de923950Johann%endif 2252263fc984bdc858ee931d3e35c87c404de923950Johann pxor m5, m5 ; m5 = dedicated 
zero 2262263fc984bdc858ee931d3e35c87c404de923950Johann 2272263fc984bdc858ee931d3e35c87c404de923950Johann DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob 2282263fc984bdc858ee931d3e35c87c404de923950Johann 2292263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 2302263fc984bdc858ee931d3e35c87c404de923950Johann lea coeffq, [ coeffq+ncoeffq*4] 2312263fc984bdc858ee931d3e35c87c404de923950Johann lea qcoeffq, [ qcoeffq+ncoeffq*4] 2322263fc984bdc858ee931d3e35c87c404de923950Johann lea dqcoeffq, [dqcoeffq+ncoeffq*4] 2332263fc984bdc858ee931d3e35c87c404de923950Johann%else 2342263fc984bdc858ee931d3e35c87c404de923950Johann lea coeffq, [ coeffq+ncoeffq*2] 2352263fc984bdc858ee931d3e35c87c404de923950Johann lea qcoeffq, [ qcoeffq+ncoeffq*2] 2362263fc984bdc858ee931d3e35c87c404de923950Johann lea dqcoeffq, [dqcoeffq+ncoeffq*2] 2372263fc984bdc858ee931d3e35c87c404de923950Johann%endif 2382263fc984bdc858ee931d3e35c87c404de923950Johann lea iscanq, [ iscanq+ncoeffq*2] 2392263fc984bdc858ee931d3e35c87c404de923950Johann neg ncoeffq 2402263fc984bdc858ee931d3e35c87c404de923950Johann 2412263fc984bdc858ee931d3e35c87c404de923950Johann ; get DC and first 15 AC coeffs 2422263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 2432263fc984bdc858ee931d3e35c87c404de923950Johann ; coeff stored as 32bit numbers & require 16bit numbers 2442263fc984bdc858ee931d3e35c87c404de923950Johann mova m9, [coeffq+ncoeffq*4+ 0] 2452263fc984bdc858ee931d3e35c87c404de923950Johann packssdw m9, [coeffq+ncoeffq*4+16] 2462263fc984bdc858ee931d3e35c87c404de923950Johann mova m10, [coeffq+ncoeffq*4+32] 2472263fc984bdc858ee931d3e35c87c404de923950Johann packssdw m10, [coeffq+ncoeffq*4+48] 2482263fc984bdc858ee931d3e35c87c404de923950Johann%else 2492263fc984bdc858ee931d3e35c87c404de923950Johann mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] 2502263fc984bdc858ee931d3e35c87c404de923950Johann mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] 
2512263fc984bdc858ee931d3e35c87c404de923950Johann%endif 2522263fc984bdc858ee931d3e35c87c404de923950Johann 2532263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m6, m9 ; m6 = abs(m9) 2542263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m11, m10 ; m11 = abs(m10) 2552263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin 2562263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m0, m0 2572263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin 2582263fc984bdc858ee931d3e35c87c404de923950Johann 2592263fc984bdc858ee931d3e35c87c404de923950Johann ; Check if all coeffs are less than zbin. If yes, skip forward quickly. 2602263fc984bdc858ee931d3e35c87c404de923950Johann por m14, m7, m12 2612263fc984bdc858ee931d3e35c87c404de923950Johann ptest m14, m14 2622263fc984bdc858ee931d3e35c87c404de923950Johann jnz .first_nonzero 2632263fc984bdc858ee931d3e35c87c404de923950Johann 2642263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 2652263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4 ], ymm5 2662263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+32], ymm5 2672263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4 ], ymm5 2682263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+32], ymm5 2692263fc984bdc858ee931d3e35c87c404de923950Johann%else 2702263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*2], ymm5 2712263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*2], ymm5 2722263fc984bdc858ee931d3e35c87c404de923950Johann%endif 2732263fc984bdc858ee931d3e35c87c404de923950Johann 2742263fc984bdc858ee931d3e35c87c404de923950Johann add ncoeffq, mmsize 2752263fc984bdc858ee931d3e35c87c404de923950Johann 2762263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m1, m1 2772263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m2, m2 2782263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m3, m3 
2792263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m4, m4 2802263fc984bdc858ee931d3e35c87c404de923950Johann pxor m8, m8 2812263fc984bdc858ee931d3e35c87c404de923950Johann 2822263fc984bdc858ee931d3e35c87c404de923950Johann jmp .ac_only_loop 2832263fc984bdc858ee931d3e35c87c404de923950Johann 2842263fc984bdc858ee931d3e35c87c404de923950Johann.first_nonzero: 2852263fc984bdc858ee931d3e35c87c404de923950Johann 2862263fc984bdc858ee931d3e35c87c404de923950Johann paddsw m6, m1 ; m6 += round 2872263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m1, m1 2882263fc984bdc858ee931d3e35c87c404de923950Johann paddsw m11, m1 ; m11 += round 2892263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m8, m6, m2 ; m8 = m6*q>>16 2902263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m2, m2 2912263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m13, m11, m2 ; m13 = m11*q>>16 2922263fc984bdc858ee931d3e35c87c404de923950Johann paddw m8, m6 ; m8 += m6 2932263fc984bdc858ee931d3e35c87c404de923950Johann paddw m13, m11 ; m13 += m11 2942263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m8, m4 ; m8 = m8*qsh>>16 2952263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m4, m4 2962263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m13, m4 ; m13 = m13*qsh>>16 2972263fc984bdc858ee931d3e35c87c404de923950Johann psignw m8, m9 ; m8 = reinsert sign 2982263fc984bdc858ee931d3e35c87c404de923950Johann psignw m13, m10 ; m13 = reinsert sign 2992263fc984bdc858ee931d3e35c87c404de923950Johann pand m8, m7 3002263fc984bdc858ee931d3e35c87c404de923950Johann pand m13, m12 3012263fc984bdc858ee931d3e35c87c404de923950Johann 3022263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 3032263fc984bdc858ee931d3e35c87c404de923950Johann ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff 3042263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m8 3052263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m8, m6 
3062263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m8 3072263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+ 0], m11 3082263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+16], m6 3092263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m13 3102263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m13, m6 3112263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m13 3122263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+32], m11 3132263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+48], m6 3142263fc984bdc858ee931d3e35c87c404de923950Johann%else 3152263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*2+ 0], m8 3162263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*2+16], m13 3172263fc984bdc858ee931d3e35c87c404de923950Johann%endif 3182263fc984bdc858ee931d3e35c87c404de923950Johann 3192263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32 3202263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m8, m8 3212263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m13, m13 3222263fc984bdc858ee931d3e35c87c404de923950Johann%endif 3232263fc984bdc858ee931d3e35c87c404de923950Johann pmullw m8, m3 ; dqc[i] = qc[i] * q 3242263fc984bdc858ee931d3e35c87c404de923950Johann punpckhqdq m3, m3 3252263fc984bdc858ee931d3e35c87c404de923950Johann pmullw m13, m3 ; dqc[i] = qc[i] * q 3262263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32 3272263fc984bdc858ee931d3e35c87c404de923950Johann psrlw m8, 1 3282263fc984bdc858ee931d3e35c87c404de923950Johann psrlw m13, 1 3292263fc984bdc858ee931d3e35c87c404de923950Johann psignw m8, m9 3302263fc984bdc858ee931d3e35c87c404de923950Johann psignw m13, m10 3312263fc984bdc858ee931d3e35c87c404de923950Johann%endif 3322263fc984bdc858ee931d3e35c87c404de923950Johann 3332263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 3342263fc984bdc858ee931d3e35c87c404de923950Johann ; store 16bit 
numbers as 32bit numbers in array pointed to by dqcoeff 3352263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m8 3362263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m8, m6 3372263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m8 3382263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+ 0], m11 3392263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+16], m6 3402263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m13 3412263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m13, m6 3422263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m13 3432263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+32], m11 3442263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+48], m6 3452263fc984bdc858ee931d3e35c87c404de923950Johann%else 3462263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*2+ 0], m8 3472263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*2+16], m13 3482263fc984bdc858ee931d3e35c87c404de923950Johann%endif 3492263fc984bdc858ee931d3e35c87c404de923950Johann 3502263fc984bdc858ee931d3e35c87c404de923950Johann pcmpeqw m8, m5 ; m8 = c[i] == 0 3512263fc984bdc858ee931d3e35c87c404de923950Johann pcmpeqw m13, m5 ; m13 = c[i] == 0 3522263fc984bdc858ee931d3e35c87c404de923950Johann mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] 3532263fc984bdc858ee931d3e35c87c404de923950Johann mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] 3542263fc984bdc858ee931d3e35c87c404de923950Johann psubw m6, m7 ; m6 = scan[i] + 1 3552263fc984bdc858ee931d3e35c87c404de923950Johann psubw m11, m12 ; m11 = scan[i] + 1 3562263fc984bdc858ee931d3e35c87c404de923950Johann pandn m8, m6 ; m8 = max(eob) 3572263fc984bdc858ee931d3e35c87c404de923950Johann pandn m13, m11 ; m13 = max(eob) 3582263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m13 3592263fc984bdc858ee931d3e35c87c404de923950Johann add ncoeffq, mmsize 
3602263fc984bdc858ee931d3e35c87c404de923950Johann 3612263fc984bdc858ee931d3e35c87c404de923950Johann.ac_only_loop: 3622263fc984bdc858ee931d3e35c87c404de923950Johann 3632263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 3642263fc984bdc858ee931d3e35c87c404de923950Johann ; pack coeff from 32bit to 16bit array 3652263fc984bdc858ee931d3e35c87c404de923950Johann mova m9, [coeffq+ncoeffq*4+ 0] 3662263fc984bdc858ee931d3e35c87c404de923950Johann packssdw m9, [coeffq+ncoeffq*4+16] 3672263fc984bdc858ee931d3e35c87c404de923950Johann mova m10, [coeffq+ncoeffq*4+32] 3682263fc984bdc858ee931d3e35c87c404de923950Johann packssdw m10, [coeffq+ncoeffq*4+48] 3692263fc984bdc858ee931d3e35c87c404de923950Johann%else 3702263fc984bdc858ee931d3e35c87c404de923950Johann mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] 3712263fc984bdc858ee931d3e35c87c404de923950Johann mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] 3722263fc984bdc858ee931d3e35c87c404de923950Johann%endif 3732263fc984bdc858ee931d3e35c87c404de923950Johann 3742263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m6, m9 ; m6 = abs(m9) 3752263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m11, m10 ; m11 = abs(m10) 3762263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin 3772263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin 3782263fc984bdc858ee931d3e35c87c404de923950Johann 3792263fc984bdc858ee931d3e35c87c404de923950Johann ; Check if all coeffs are less than zbin. If yes, skip this iteration 3802263fc984bdc858ee931d3e35c87c404de923950Johann ; and just write zeros, as that is what the result would be. 
3812263fc984bdc858ee931d3e35c87c404de923950Johann por m14, m7, m12 3822263fc984bdc858ee931d3e35c87c404de923950Johann ptest m14, m14 3832263fc984bdc858ee931d3e35c87c404de923950Johann jnz .rest_nonzero 3842263fc984bdc858ee931d3e35c87c404de923950Johann 3852263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 3862263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+ 0], ymm5 3872263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+32], ymm5 3882263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+ 0], ymm5 3892263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+32], ymm5 3902263fc984bdc858ee931d3e35c87c404de923950Johann%else 3912263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*2+ 0], ymm5 3922263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*2+ 0], ymm5 3932263fc984bdc858ee931d3e35c87c404de923950Johann%endif 3942263fc984bdc858ee931d3e35c87c404de923950Johann add ncoeffq, mmsize 3952263fc984bdc858ee931d3e35c87c404de923950Johann jnz .ac_only_loop 3962263fc984bdc858ee931d3e35c87c404de923950Johann 3972263fc984bdc858ee931d3e35c87c404de923950Johann ; Horizontally accumulate/max eobs and write into [eob] memory pointer 3982263fc984bdc858ee931d3e35c87c404de923950Johann mov r2, eobmp 3992263fc984bdc858ee931d3e35c87c404de923950Johann pshufd m7, m8, 0xe 4002263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 4012263fc984bdc858ee931d3e35c87c404de923950Johann pshuflw m7, m8, 0xe 4022263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 4032263fc984bdc858ee931d3e35c87c404de923950Johann pshuflw m7, m8, 0x1 4042263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 4052263fc984bdc858ee931d3e35c87c404de923950Johann movq rax, m8 4062263fc984bdc858ee931d3e35c87c404de923950Johann mov [r2], ax 4072263fc984bdc858ee931d3e35c87c404de923950Johann vzeroupper 4082263fc984bdc858ee931d3e35c87c404de923950Johann RET 
4092263fc984bdc858ee931d3e35c87c404de923950Johann 4102263fc984bdc858ee931d3e35c87c404de923950Johann.rest_nonzero: 4112263fc984bdc858ee931d3e35c87c404de923950Johann paddsw m6, m1 ; m6 += round 4122263fc984bdc858ee931d3e35c87c404de923950Johann paddsw m11, m1 ; m11 += round 4132263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m14, m6, m2 ; m14 = m6*q>>16 4142263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m13, m11, m2 ; m13 = m11*q>>16 4152263fc984bdc858ee931d3e35c87c404de923950Johann paddw m14, m6 ; m14 += m6 4162263fc984bdc858ee931d3e35c87c404de923950Johann paddw m13, m11 ; m13 += m11 4172263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m14, m4 ; m14 = m14*qsh>>16 4182263fc984bdc858ee931d3e35c87c404de923950Johann pmulhw m13, m4 ; m13 = m13*qsh>>16 4192263fc984bdc858ee931d3e35c87c404de923950Johann psignw m14, m9 ; m14 = reinsert sign 4202263fc984bdc858ee931d3e35c87c404de923950Johann psignw m13, m10 ; m13 = reinsert sign 4212263fc984bdc858ee931d3e35c87c404de923950Johann pand m14, m7 4222263fc984bdc858ee931d3e35c87c404de923950Johann pand m13, m12 4232263fc984bdc858ee931d3e35c87c404de923950Johann 4242263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 4252263fc984bdc858ee931d3e35c87c404de923950Johann ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff 4262263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m14 4272263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m14, m6 4282263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m14 4292263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+ 0], m11 4302263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+16], m6 4312263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m13 4322263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m13, m6 4332263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m13 4342263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+32], m11 
4352263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+48], m6 4362263fc984bdc858ee931d3e35c87c404de923950Johann%else 4372263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*2+ 0], m14 4382263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*2+16], m13 4392263fc984bdc858ee931d3e35c87c404de923950Johann%endif 4402263fc984bdc858ee931d3e35c87c404de923950Johann 4412263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32 4422263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m14, m14 4432263fc984bdc858ee931d3e35c87c404de923950Johann pabsw m13, m13 4442263fc984bdc858ee931d3e35c87c404de923950Johann%endif 4452263fc984bdc858ee931d3e35c87c404de923950Johann pmullw m14, m3 ; dqc[i] = qc[i] * q 4462263fc984bdc858ee931d3e35c87c404de923950Johann pmullw m13, m3 ; dqc[i] = qc[i] * q 4472263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32 4482263fc984bdc858ee931d3e35c87c404de923950Johann psrlw m14, 1 4492263fc984bdc858ee931d3e35c87c404de923950Johann psrlw m13, 1 4502263fc984bdc858ee931d3e35c87c404de923950Johann psignw m14, m9 4512263fc984bdc858ee931d3e35c87c404de923950Johann psignw m13, m10 4522263fc984bdc858ee931d3e35c87c404de923950Johann%endif 4532263fc984bdc858ee931d3e35c87c404de923950Johann 4542263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 4552263fc984bdc858ee931d3e35c87c404de923950Johann ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff 4562263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m14 4572263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m14, m6 4582263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m14 4592263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+ 0], m11 4602263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+16], m6 4612263fc984bdc858ee931d3e35c87c404de923950Johann pcmpgtw m6, m5, m13 4622263fc984bdc858ee931d3e35c87c404de923950Johann punpckhwd m6, m13, m6 
4632263fc984bdc858ee931d3e35c87c404de923950Johann pmovsxwd m11, m13 4642263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+32], m11 4652263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+48], m6 4662263fc984bdc858ee931d3e35c87c404de923950Johann%else 4672263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*2+ 0], m14 4682263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*2+16], m13 4692263fc984bdc858ee931d3e35c87c404de923950Johann%endif 4702263fc984bdc858ee931d3e35c87c404de923950Johann 4712263fc984bdc858ee931d3e35c87c404de923950Johann pcmpeqw m14, m5 ; m14 = c[i] == 0 4722263fc984bdc858ee931d3e35c87c404de923950Johann pcmpeqw m13, m5 ; m13 = c[i] == 0 4732263fc984bdc858ee931d3e35c87c404de923950Johann mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] 4742263fc984bdc858ee931d3e35c87c404de923950Johann mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] 4752263fc984bdc858ee931d3e35c87c404de923950Johann psubw m6, m7 ; m6 = scan[i] + 1 4762263fc984bdc858ee931d3e35c87c404de923950Johann psubw m11, m12 ; m11 = scan[i] + 1 4772263fc984bdc858ee931d3e35c87c404de923950Johann pandn m14, m6 ; m14 = max(eob) 4782263fc984bdc858ee931d3e35c87c404de923950Johann pandn m13, m11 ; m13 = max(eob) 4792263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m14 4802263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m13 4812263fc984bdc858ee931d3e35c87c404de923950Johann add ncoeffq, mmsize 4822263fc984bdc858ee931d3e35c87c404de923950Johann jnz .ac_only_loop 4832263fc984bdc858ee931d3e35c87c404de923950Johann 4842263fc984bdc858ee931d3e35c87c404de923950Johann ; Horizontally accumulate/max eobs and write into [eob] memory pointer 4852263fc984bdc858ee931d3e35c87c404de923950Johann mov r2, eobmp 4862263fc984bdc858ee931d3e35c87c404de923950Johann pshufd m7, m8, 0xe 4872263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 4882263fc984bdc858ee931d3e35c87c404de923950Johann pshuflw m7, m8, 0xe 
4892263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 4902263fc984bdc858ee931d3e35c87c404de923950Johann pshuflw m7, m8, 0x1 4912263fc984bdc858ee931d3e35c87c404de923950Johann pmaxsw m8, m7 4922263fc984bdc858ee931d3e35c87c404de923950Johann movq rax, m8 4932263fc984bdc858ee931d3e35c87c404de923950Johann mov [r2], ax 4942263fc984bdc858ee931d3e35c87c404de923950Johann vzeroupper 4952263fc984bdc858ee931d3e35c87c404de923950Johann RET 4962263fc984bdc858ee931d3e35c87c404de923950Johann 4972263fc984bdc858ee931d3e35c87c404de923950Johann ; Skip-block, i.e. just write all zeroes 4982263fc984bdc858ee931d3e35c87c404de923950Johann.blank: 4992263fc984bdc858ee931d3e35c87c404de923950Johann 5002263fc984bdc858ee931d3e35c87c404de923950JohannDEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ 5012263fc984bdc858ee931d3e35c87c404de923950Johann qcoeff, dqcoeff, dequant, eob, scan, iscan 5022263fc984bdc858ee931d3e35c87c404de923950Johann 5032263fc984bdc858ee931d3e35c87c404de923950Johann mov r0, dqcoeffmp 5042263fc984bdc858ee931d3e35c87c404de923950Johann movifnidn ncoeffq, ncoeffmp 5052263fc984bdc858ee931d3e35c87c404de923950Johann mov r2, qcoeffmp 5062263fc984bdc858ee931d3e35c87c404de923950Johann mov r3, eobmp 5072263fc984bdc858ee931d3e35c87c404de923950Johann 5082263fc984bdc858ee931d3e35c87c404de923950JohannDEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob 5092263fc984bdc858ee931d3e35c87c404de923950Johann 5102263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 5112263fc984bdc858ee931d3e35c87c404de923950Johann lea dqcoeffq, [dqcoeffq+ncoeffq*4] 5122263fc984bdc858ee931d3e35c87c404de923950Johann lea qcoeffq, [ qcoeffq+ncoeffq*4] 5132263fc984bdc858ee931d3e35c87c404de923950Johann%else 5142263fc984bdc858ee931d3e35c87c404de923950Johann lea dqcoeffq, [dqcoeffq+ncoeffq*2] 5152263fc984bdc858ee931d3e35c87c404de923950Johann lea qcoeffq, [ qcoeffq+ncoeffq*2] 5162263fc984bdc858ee931d3e35c87c404de923950Johann%endif 5172263fc984bdc858ee931d3e35c87c404de923950Johann 
5182263fc984bdc858ee931d3e35c87c404de923950Johann neg ncoeffq 5192263fc984bdc858ee931d3e35c87c404de923950Johann pxor m7, m7 5202263fc984bdc858ee931d3e35c87c404de923950Johann 5212263fc984bdc858ee931d3e35c87c404de923950Johann.blank_loop: 5222263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH 5232263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+ 0], ymm7 5242263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*4+32], ymm7 5252263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+ 0], ymm7 5262263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*4+32], ymm7 5272263fc984bdc858ee931d3e35c87c404de923950Johann%else 5282263fc984bdc858ee931d3e35c87c404de923950Johann mova [dqcoeffq+ncoeffq*2+ 0], ymm7 5292263fc984bdc858ee931d3e35c87c404de923950Johann mova [qcoeffq+ncoeffq*2+ 0], ymm7 5302263fc984bdc858ee931d3e35c87c404de923950Johann%endif 5312263fc984bdc858ee931d3e35c87c404de923950Johann add ncoeffq, mmsize 5322263fc984bdc858ee931d3e35c87c404de923950Johann jl .blank_loop 5332263fc984bdc858ee931d3e35c87c404de923950Johann 5342263fc984bdc858ee931d3e35c87c404de923950Johann mov [eobq], word 0 5352263fc984bdc858ee931d3e35c87c404de923950Johann 5362263fc984bdc858ee931d3e35c87c404de923950Johann vzeroupper 5372263fc984bdc858ee931d3e35c87c404de923950Johann RET 5382263fc984bdc858ee931d3e35c87c404de923950Johann%endmacro 5392263fc984bdc858ee931d3e35c87c404de923950Johann 5402263fc984bdc858ee931d3e35c87c404de923950JohannINIT_XMM avx 5412263fc984bdc858ee931d3e35c87c404de923950JohannQUANTIZE_FN b, 7 5422263fc984bdc858ee931d3e35c87c404de923950JohannQUANTIZE_FN b_32x32, 7 5432263fc984bdc858ee931d3e35c87c404de923950Johann 5442263fc984bdc858ee931d3e35c87c404de923950JohannEND 545