12263fc984bdc858ee931d3e35c87c404de923950Johann;
22263fc984bdc858ee931d3e35c87c404de923950Johann;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
32263fc984bdc858ee931d3e35c87c404de923950Johann;
42263fc984bdc858ee931d3e35c87c404de923950Johann;  Use of this source code is governed by a BSD-style license
52263fc984bdc858ee931d3e35c87c404de923950Johann;  that can be found in the LICENSE file in the root of the source
62263fc984bdc858ee931d3e35c87c404de923950Johann;  tree. An additional intellectual property rights grant can be found
72263fc984bdc858ee931d3e35c87c404de923950Johann;  in the file PATENTS.  All contributing project authors may
82263fc984bdc858ee931d3e35c87c404de923950Johann;  be found in the AUTHORS file in the root of the source tree.
92263fc984bdc858ee931d3e35c87c404de923950Johann;
102263fc984bdc858ee931d3e35c87c404de923950Johann
112263fc984bdc858ee931d3e35c87c404de923950Johann%include "third_party/x86inc/x86inc.asm"
122263fc984bdc858ee931d3e35c87c404de923950Johann
132263fc984bdc858ee931d3e35c87c404de923950JohannSECTION .text
142263fc984bdc858ee931d3e35c87c404de923950Johann
152263fc984bdc858ee931d3e35c87c404de923950Johann%macro QUANTIZE_FN 2
162263fc984bdc858ee931d3e35c87c404de923950Johanncglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
172263fc984bdc858ee931d3e35c87c404de923950Johann                                shift, qcoeff, dqcoeff, dequant, \
182263fc984bdc858ee931d3e35c87c404de923950Johann                                eob, scan, iscan
192263fc984bdc858ee931d3e35c87c404de923950Johann
202263fc984bdc858ee931d3e35c87c404de923950Johann  vzeroupper
212263fc984bdc858ee931d3e35c87c404de923950Johann
222263fc984bdc858ee931d3e35c87c404de923950Johann  ; If we can skip this block, then just zero the output
232263fc984bdc858ee931d3e35c87c404de923950Johann  cmp                         skipmp, 0
242263fc984bdc858ee931d3e35c87c404de923950Johann  jne .blank
252263fc984bdc858ee931d3e35c87c404de923950Johann
262263fc984bdc858ee931d3e35c87c404de923950Johann%ifnidn %1, b_32x32
272263fc984bdc858ee931d3e35c87c404de923950Johann
282263fc984bdc858ee931d3e35c87c404de923950Johann  ; Special case for ncoeff == 16, as it is frequent and we can save on
292263fc984bdc858ee931d3e35c87c404de923950Johann  ; not setting up a loop.
302263fc984bdc858ee931d3e35c87c404de923950Johann  cmp                       ncoeffmp, 16
312263fc984bdc858ee931d3e35c87c404de923950Johann  jne .generic
322263fc984bdc858ee931d3e35c87c404de923950Johann
332263fc984bdc858ee931d3e35c87c404de923950Johann  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
342263fc984bdc858ee931d3e35c87c404de923950Johann  ;; Special case of ncoeff == 16
352263fc984bdc858ee931d3e35c87c404de923950Johann  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
362263fc984bdc858ee931d3e35c87c404de923950Johann
372263fc984bdc858ee931d3e35c87c404de923950Johann.single:
382263fc984bdc858ee931d3e35c87c404de923950Johann
392263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                   coeffq, coeffmp
402263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                    zbinq, zbinmp
412263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m0, [zbinq]              ; m0 = zbin
422263fc984bdc858ee931d3e35c87c404de923950Johann
432263fc984bdc858ee931d3e35c87c404de923950Johann  ; Get DC and first 15 AC coeffs - in this special case, that is all.
442263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
452263fc984bdc858ee931d3e35c87c404de923950Johann  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
462263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m9, [coeffq]
472263fc984bdc858ee931d3e35c87c404de923950Johann  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
482263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m10, [coeffq+32]
492263fc984bdc858ee931d3e35c87c404de923950Johann  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
502263fc984bdc858ee931d3e35c87c404de923950Johann%else
512263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m9, [coeffq]             ; m9 = c[i]
522263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m10, [coeffq+16]          ; m10 = c[i]
532263fc984bdc858ee931d3e35c87c404de923950Johann%endif
542263fc984bdc858ee931d3e35c87c404de923950Johann
552263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r0, eobmp                ; Output pointer
562263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r1, qcoeffmp             ; Output pointer
572263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r2, dqcoeffmp            ; Output pointer
582263fc984bdc858ee931d3e35c87c404de923950Johann
592263fc984bdc858ee931d3e35c87c404de923950Johann  pxor                            m5, m5                   ; m5 = dedicated zero
602263fc984bdc858ee931d3e35c87c404de923950Johann
612263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpeqw                         m4, m4                   ; All word lanes -1
622263fc984bdc858ee931d3e35c87c404de923950Johann  paddw                           m0, m4                   ; m0 = zbin - 1
632263fc984bdc858ee931d3e35c87c404de923950Johann
642263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                           m6, m9                   ; m6 = abs(m9)
652263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                          m11, m10                  ; m11 = abs(m10)
662263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
672263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m0, m0
682263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
692263fc984bdc858ee931d3e35c87c404de923950Johann
702263fc984bdc858ee931d3e35c87c404de923950Johann  ; Check if all coeffs are less than zbin. If yes, we just write zeros
712263fc984bdc858ee931d3e35c87c404de923950Johann  ; to the outputs and we are done.
722263fc984bdc858ee931d3e35c87c404de923950Johann  por                            m14, m7, m12
732263fc984bdc858ee931d3e35c87c404de923950Johann  ptest                          m14, m14
742263fc984bdc858ee931d3e35c87c404de923950Johann  jnz .single_nonzero
752263fc984bdc858ee931d3e35c87c404de923950Johann
762263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
772263fc984bdc858ee931d3e35c87c404de923950Johann  mova                       [r1   ], ymm5
782263fc984bdc858ee931d3e35c87c404de923950Johann  mova                       [r1+32], ymm5
792263fc984bdc858ee931d3e35c87c404de923950Johann  mova                       [r2   ], ymm5
802263fc984bdc858ee931d3e35c87c404de923950Johann  mova                       [r2+32], ymm5
812263fc984bdc858ee931d3e35c87c404de923950Johann%else
822263fc984bdc858ee931d3e35c87c404de923950Johann  mova                          [r1], ymm5
832263fc984bdc858ee931d3e35c87c404de923950Johann  mova                          [r2], ymm5
842263fc984bdc858ee931d3e35c87c404de923950Johann%endif
852263fc984bdc858ee931d3e35c87c404de923950Johann  mov                           [r0], word 0
862263fc984bdc858ee931d3e35c87c404de923950Johann
872263fc984bdc858ee931d3e35c87c404de923950Johann  vzeroupper
882263fc984bdc858ee931d3e35c87c404de923950Johann  RET
892263fc984bdc858ee931d3e35c87c404de923950Johann
902263fc984bdc858ee931d3e35c87c404de923950Johann.single_nonzero:
912263fc984bdc858ee931d3e35c87c404de923950Johann
922263fc984bdc858ee931d3e35c87c404de923950Johann  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
932263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                       r4, roundmp
942263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                       r5, quantmp
952263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r3, dequantmp
962263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r6, shiftmp
972263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m1, [r4]              ; m1 = round
982263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m2, [r5]              ; m2 = quant
992263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m3, [r3]              ; m3 = dequant
1002263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m4, [r6]              ; m4 = shift
1012263fc984bdc858ee931d3e35c87c404de923950Johann
1022263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r3, iscanmp
1032263fc984bdc858ee931d3e35c87c404de923950Johann
1042263fc984bdc858ee931d3e35c87c404de923950Johann  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
1052263fc984bdc858ee931d3e35c87c404de923950Johann
1062263fc984bdc858ee931d3e35c87c404de923950Johann  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1072263fc984bdc858ee931d3e35c87c404de923950Johann
1082263fc984bdc858ee931d3e35c87c404de923950Johann  paddsw                          m6, m1                   ; m6 += round
1092263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m1, m1
1102263fc984bdc858ee931d3e35c87c404de923950Johann  paddsw                         m11, m1                   ; m11 += round
1112263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
1122263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m2, m2
1132263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
1142263fc984bdc858ee931d3e35c87c404de923950Johann  paddw                           m8, m6                   ; m8 += m6
1152263fc984bdc858ee931d3e35c87c404de923950Johann  paddw                          m13, m11                  ; m13 += m11
1162263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
1172263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m4, m4
1182263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
1192263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                          m8, m9                   ; m8 = reinsert sign
1202263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                         m13, m10                  ; m13 = reinsert sign
1212263fc984bdc858ee931d3e35c87c404de923950Johann  pand                            m8, m7
1222263fc984bdc858ee931d3e35c87c404de923950Johann  pand                           m13, m12
1232263fc984bdc858ee931d3e35c87c404de923950Johann
1242263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
1252263fc984bdc858ee931d3e35c87c404de923950Johann  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
1262263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m8
1272263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m8, m6
1282263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m8
1292263fc984bdc858ee931d3e35c87c404de923950Johann  mova                  [qcoeffq   ], m11
1302263fc984bdc858ee931d3e35c87c404de923950Johann  mova                  [qcoeffq+16], m6
1312263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m13
1322263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m13, m6
1332263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m13
1342263fc984bdc858ee931d3e35c87c404de923950Johann  mova                  [qcoeffq+32], m11
1352263fc984bdc858ee931d3e35c87c404de923950Johann  mova                  [qcoeffq+48], m6
1362263fc984bdc858ee931d3e35c87c404de923950Johann%else
1372263fc984bdc858ee931d3e35c87c404de923950Johann  mova                  [qcoeffq   ], m8
1382263fc984bdc858ee931d3e35c87c404de923950Johann  mova                  [qcoeffq+16], m13
1392263fc984bdc858ee931d3e35c87c404de923950Johann%endif
1402263fc984bdc858ee931d3e35c87c404de923950Johann
1412263fc984bdc858ee931d3e35c87c404de923950Johann  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
1422263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m3, m3
1432263fc984bdc858ee931d3e35c87c404de923950Johann  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
1442263fc984bdc858ee931d3e35c87c404de923950Johann
1452263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
1462263fc984bdc858ee931d3e35c87c404de923950Johann  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
1472263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m8
1482263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m8, m6
1492263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m8
1502263fc984bdc858ee931d3e35c87c404de923950Johann  mova                 [dqcoeffq   ], m11
1512263fc984bdc858ee931d3e35c87c404de923950Johann  mova                 [dqcoeffq+16], m6
1522263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m13
1532263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m13, m6
1542263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m13
1552263fc984bdc858ee931d3e35c87c404de923950Johann  mova                 [dqcoeffq+32], m11
1562263fc984bdc858ee931d3e35c87c404de923950Johann  mova                 [dqcoeffq+48], m6
1572263fc984bdc858ee931d3e35c87c404de923950Johann%else
1582263fc984bdc858ee931d3e35c87c404de923950Johann  mova                 [dqcoeffq   ], m8
1592263fc984bdc858ee931d3e35c87c404de923950Johann  mova                 [dqcoeffq+16], m13
1602263fc984bdc858ee931d3e35c87c404de923950Johann%endif
1612263fc984bdc858ee931d3e35c87c404de923950Johann
1622263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m6, [iscanq]            ; m6 = scan[i]
1632263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m11, [iscanq+16]         ; m11 = scan[i]
1642263fc984bdc858ee931d3e35c87c404de923950Johann
1652263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpeqw                         m8,  m8,  m5            ; m8 = c[i] == 0
1662263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpeqw                        m13, m13,  m5            ; m13 = c[i] == 0
1672263fc984bdc858ee931d3e35c87c404de923950Johann  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
1682263fc984bdc858ee931d3e35c87c404de923950Johann  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
1692263fc984bdc858ee931d3e35c87c404de923950Johann  pandn                           m8,  m8,  m6            ; m8 = max(eob)
1702263fc984bdc858ee931d3e35c87c404de923950Johann  pandn                          m13, m13, m11            ; m13 = max(eob)
1712263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8,  m8, m13
1722263fc984bdc858ee931d3e35c87c404de923950Johann
1732263fc984bdc858ee931d3e35c87c404de923950Johann  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
1742263fc984bdc858ee931d3e35c87c404de923950Johann  pshufd                          m7, m8, 0xe
1752263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7
1762263fc984bdc858ee931d3e35c87c404de923950Johann  pshuflw                         m7, m8, 0xe
1772263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7
1782263fc984bdc858ee931d3e35c87c404de923950Johann  pshuflw                         m7, m8, 0x1
1792263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7
1802263fc984bdc858ee931d3e35c87c404de923950Johann  movq                           rax, m8
1812263fc984bdc858ee931d3e35c87c404de923950Johann  mov                         [eobq], ax
1822263fc984bdc858ee931d3e35c87c404de923950Johann
1832263fc984bdc858ee931d3e35c87c404de923950Johann  vzeroupper
1842263fc984bdc858ee931d3e35c87c404de923950Johann  RET
1852263fc984bdc858ee931d3e35c87c404de923950Johann
1862263fc984bdc858ee931d3e35c87c404de923950Johann  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1872263fc984bdc858ee931d3e35c87c404de923950Johann  ;; Generic case of ncoeff != 16
1882263fc984bdc858ee931d3e35c87c404de923950Johann  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1892263fc984bdc858ee931d3e35c87c404de923950Johann
1902263fc984bdc858ee931d3e35c87c404de923950Johann.generic:
1912263fc984bdc858ee931d3e35c87c404de923950Johann
1922263fc984bdc858ee931d3e35c87c404de923950Johann%endif ; %ifnidn %1, b_32x32
1932263fc984bdc858ee931d3e35c87c404de923950Johann
1942263fc984bdc858ee931d3e35c87c404de923950JohannDEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
1952263fc984bdc858ee931d3e35c87c404de923950Johann            qcoeff, dqcoeff, dequant, eob, scan, iscan
1962263fc984bdc858ee931d3e35c87c404de923950Johann
1972263fc984bdc858ee931d3e35c87c404de923950Johann  ; Actual quantization loop - setup pointers, rounders, etc.
1982263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                   coeffq, coeffmp
1992263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                  ncoeffq, ncoeffmp
2002263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r2, dequantmp
2012263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                    zbinq, zbinmp
2022263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                   roundq, roundmp
2032263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                   quantq, quantmp
2042263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m0, [zbinq]              ; m0 = zbin
2052263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m1, [roundq]             ; m1 = round
2062263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m2, [quantq]             ; m2 = quant
2072263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m3, [r2]                 ; m3 = dequant
2082263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpeqw                         m4, m4                   ; All lanes -1
2092263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32
2102263fc984bdc858ee931d3e35c87c404de923950Johann  psubw                           m0, m4
2112263fc984bdc858ee931d3e35c87c404de923950Johann  psubw                           m1, m4
2122263fc984bdc858ee931d3e35c87c404de923950Johann  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
2132263fc984bdc858ee931d3e35c87c404de923950Johann  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
2142263fc984bdc858ee931d3e35c87c404de923950Johann%endif
2152263fc984bdc858ee931d3e35c87c404de923950Johann  paddw                           m0, m4                   ; m0 = m0 - 1
2162263fc984bdc858ee931d3e35c87c404de923950Johann
2172263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r2, shiftmp
2182263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r3, qcoeffmp
2192263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m4, [r2]                 ; m4 = shift
2202263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r4, dqcoeffmp
2212263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r5, iscanmp
2222263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32
2232263fc984bdc858ee931d3e35c87c404de923950Johann  psllw                           m4, 1
2242263fc984bdc858ee931d3e35c87c404de923950Johann%endif
2252263fc984bdc858ee931d3e35c87c404de923950Johann  pxor                            m5, m5                   ; m5 = dedicated zero
2262263fc984bdc858ee931d3e35c87c404de923950Johann
2272263fc984bdc858ee931d3e35c87c404de923950Johann  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob
2282263fc984bdc858ee931d3e35c87c404de923950Johann
2292263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
2302263fc984bdc858ee931d3e35c87c404de923950Johann  lea                         coeffq, [  coeffq+ncoeffq*4]
2312263fc984bdc858ee931d3e35c87c404de923950Johann  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
2322263fc984bdc858ee931d3e35c87c404de923950Johann  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
2332263fc984bdc858ee931d3e35c87c404de923950Johann%else
2342263fc984bdc858ee931d3e35c87c404de923950Johann  lea                         coeffq, [  coeffq+ncoeffq*2]
2352263fc984bdc858ee931d3e35c87c404de923950Johann  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
2362263fc984bdc858ee931d3e35c87c404de923950Johann  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
2372263fc984bdc858ee931d3e35c87c404de923950Johann%endif
2382263fc984bdc858ee931d3e35c87c404de923950Johann  lea                         iscanq, [  iscanq+ncoeffq*2]
2392263fc984bdc858ee931d3e35c87c404de923950Johann  neg                        ncoeffq
2402263fc984bdc858ee931d3e35c87c404de923950Johann
2412263fc984bdc858ee931d3e35c87c404de923950Johann  ; get DC and first 15 AC coeffs
2422263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
2432263fc984bdc858ee931d3e35c87c404de923950Johann  ; coeff stored as 32bit numbers & require 16bit numbers
2442263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m9, [coeffq+ncoeffq*4+ 0]
2452263fc984bdc858ee931d3e35c87c404de923950Johann  packssdw                        m9, [coeffq+ncoeffq*4+16]
2462263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m10, [coeffq+ncoeffq*4+32]
2472263fc984bdc858ee931d3e35c87c404de923950Johann  packssdw                       m10, [coeffq+ncoeffq*4+48]
2482263fc984bdc858ee931d3e35c87c404de923950Johann%else
2492263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i]
2502263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i]
2512263fc984bdc858ee931d3e35c87c404de923950Johann%endif
2522263fc984bdc858ee931d3e35c87c404de923950Johann
2532263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                           m6, m9                   ; m6 = abs(m9)
2542263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                          m11, m10                  ; m11 = abs(m10)
2552263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
2562263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m0, m0
2572263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
2582263fc984bdc858ee931d3e35c87c404de923950Johann
2592263fc984bdc858ee931d3e35c87c404de923950Johann  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
2602263fc984bdc858ee931d3e35c87c404de923950Johann  por                            m14, m7, m12
2612263fc984bdc858ee931d3e35c87c404de923950Johann  ptest                          m14, m14
2622263fc984bdc858ee931d3e35c87c404de923950Johann  jnz .first_nonzero
2632263fc984bdc858ee931d3e35c87c404de923950Johann
2642263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
2652263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4   ], ymm5
2662263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+32], ymm5
2672263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4   ], ymm5
2682263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+32], ymm5
2692263fc984bdc858ee931d3e35c87c404de923950Johann%else
2702263fc984bdc858ee931d3e35c87c404de923950Johann  mova           [qcoeffq+ncoeffq*2], ymm5
2712263fc984bdc858ee931d3e35c87c404de923950Johann  mova          [dqcoeffq+ncoeffq*2], ymm5
2722263fc984bdc858ee931d3e35c87c404de923950Johann%endif
2732263fc984bdc858ee931d3e35c87c404de923950Johann
2742263fc984bdc858ee931d3e35c87c404de923950Johann  add                        ncoeffq, mmsize
2752263fc984bdc858ee931d3e35c87c404de923950Johann
2762263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m1, m1
2772263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m2, m2
2782263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m3, m3
2792263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m4, m4
2802263fc984bdc858ee931d3e35c87c404de923950Johann  pxor                            m8, m8
2812263fc984bdc858ee931d3e35c87c404de923950Johann
2822263fc984bdc858ee931d3e35c87c404de923950Johann  jmp .ac_only_loop
2832263fc984bdc858ee931d3e35c87c404de923950Johann
2842263fc984bdc858ee931d3e35c87c404de923950Johann.first_nonzero:
2852263fc984bdc858ee931d3e35c87c404de923950Johann
2862263fc984bdc858ee931d3e35c87c404de923950Johann  paddsw                          m6, m1                   ; m6 += round
2872263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m1, m1
2882263fc984bdc858ee931d3e35c87c404de923950Johann  paddsw                         m11, m1                   ; m11 += round
2892263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
2902263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m2, m2
2912263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
2922263fc984bdc858ee931d3e35c87c404de923950Johann  paddw                           m8, m6                   ; m8 += m6
2932263fc984bdc858ee931d3e35c87c404de923950Johann  paddw                          m13, m11                  ; m13 += m11
2942263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
2952263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m4, m4
2962263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
2972263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                          m8, m9                   ; m8 = reinsert sign
2982263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                         m13, m10                  ; m13 = reinsert sign
2992263fc984bdc858ee931d3e35c87c404de923950Johann  pand                            m8, m7
3002263fc984bdc858ee931d3e35c87c404de923950Johann  pand                           m13, m12
3012263fc984bdc858ee931d3e35c87c404de923950Johann
3022263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
3032263fc984bdc858ee931d3e35c87c404de923950Johann  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
3042263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m8
3052263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m8, m6
3062263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m8
3072263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+ 0], m11
3082263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+16], m6
3092263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m13
3102263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m13, m6
3112263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m13
3122263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+32], m11
3132263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+48], m6
3142263fc984bdc858ee931d3e35c87c404de923950Johann%else
3152263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*2+ 0], m8
3162263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*2+16], m13
3172263fc984bdc858ee931d3e35c87c404de923950Johann%endif
3182263fc984bdc858ee931d3e35c87c404de923950Johann
3192263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32
3202263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                           m8, m8
3212263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                          m13, m13
3222263fc984bdc858ee931d3e35c87c404de923950Johann%endif
3232263fc984bdc858ee931d3e35c87c404de923950Johann  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
3242263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhqdq                      m3, m3
3252263fc984bdc858ee931d3e35c87c404de923950Johann  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
3262263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32
3272263fc984bdc858ee931d3e35c87c404de923950Johann  psrlw                           m8, 1
3282263fc984bdc858ee931d3e35c87c404de923950Johann  psrlw                          m13, 1
3292263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                          m8, m9
3302263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                         m13, m10
3312263fc984bdc858ee931d3e35c87c404de923950Johann%endif
3322263fc984bdc858ee931d3e35c87c404de923950Johann
3332263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
3342263fc984bdc858ee931d3e35c87c404de923950Johann  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
3352263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m8
3362263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m8, m6
3372263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m8
3382263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+ 0], m11
3392263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+16], m6
3402263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m13
3412263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m13, m6
3422263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m13
3432263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+32], m11
3442263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+48], m6
3452263fc984bdc858ee931d3e35c87c404de923950Johann%else
3462263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*2+ 0], m8
3472263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*2+16], m13
3482263fc984bdc858ee931d3e35c87c404de923950Johann%endif
3492263fc984bdc858ee931d3e35c87c404de923950Johann
3502263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
3512263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
3522263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
3532263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
3542263fc984bdc858ee931d3e35c87c404de923950Johann  psubw                           m6, m7                    ; m6 = scan[i] + 1
3552263fc984bdc858ee931d3e35c87c404de923950Johann  psubw                          m11, m12                   ; m11 = scan[i] + 1
3562263fc984bdc858ee931d3e35c87c404de923950Johann  pandn                           m8, m6                    ; m8 = max(eob)
3572263fc984bdc858ee931d3e35c87c404de923950Johann  pandn                          m13, m11                   ; m13 = max(eob)
3582263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m13
3592263fc984bdc858ee931d3e35c87c404de923950Johann  add                        ncoeffq, mmsize
3602263fc984bdc858ee931d3e35c87c404de923950Johann
3612263fc984bdc858ee931d3e35c87c404de923950Johann.ac_only_loop:
  ; Main loop: each iteration handles 16 coefficients (two XMM registers of
  ; eight 16-bit words) at index ncoeffq and repeats until ncoeffq reaches 0.
  ; Registers as used in this section:
  ;   m0      = zbin threshold compared against abs(c[i])
  ;   m5      = all zeros (also used for the zero stores below)
  ;   m8      = running per-lane maximum of the eob candidates
  ;   m9/m10  = signed input coefficients of this iteration
  ;   m7/m12  = per-lane "passes zbin" masks of this iteration
  ; NOTE(review): m0 and m5 are initialised before this section (not visible
  ; here) -- confirm their setup against the preceding code.
3622263fc984bdc858ee931d3e35c87c404de923950Johann
3632263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
3642263fc984bdc858ee931d3e35c87c404de923950Johann  ; Coefficients are 32-bit here: narrow them to 16-bit words (packssdw
  ; saturates to the signed 16-bit range).
3652263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m9, [coeffq+ncoeffq*4+ 0]
3662263fc984bdc858ee931d3e35c87c404de923950Johann  packssdw                        m9, [coeffq+ncoeffq*4+16]
3672263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m10, [coeffq+ncoeffq*4+32]
3682263fc984bdc858ee931d3e35c87c404de923950Johann  packssdw                       m10, [coeffq+ncoeffq*4+48]
3692263fc984bdc858ee931d3e35c87c404de923950Johann%else
3702263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i], lanes 0..7
3712263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m10, [coeffq+ncoeffq*2+16] ; m10 = c[i], lanes 8..15
3722263fc984bdc858ee931d3e35c87c404de923950Johann%endif
3732263fc984bdc858ee931d3e35c87c404de923950Johann
3742263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                           m6, m9                   ; m6 = abs(m9)
3752263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                          m11, m10                  ; m11 = abs(m10)
3762263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m7, m6, m0               ; m7 = mask of lanes with abs(c[i]) > m0 (passes zbin)
3772263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                        m12, m11, m0              ; m12 = mask of lanes with abs(c[i]) > m0 (passes zbin)
3782263fc984bdc858ee931d3e35c87c404de923950Johann
3792263fc984bdc858ee931d3e35c87c404de923950Johann  ; Check if all coeffs are less than zbin. If yes, skip this iteration
3802263fc984bdc858ee931d3e35c87c404de923950Johann  ; and just write zeros, since that is what the result would be.
3812263fc984bdc858ee931d3e35c87c404de923950Johann  por                            m14, m7, m12
3822263fc984bdc858ee931d3e35c87c404de923950Johann  ptest                          m14, m14
3832263fc984bdc858ee931d3e35c87c404de923950Johann  jnz .rest_nonzero
3842263fc984bdc858ee931d3e35c87c404de923950Johann
  ; Fast path: zero the 16 output entries of qcoeff and dqcoeff using 32-byte
  ; stores from ymm5 (two stores per array for 32-bit outputs, one for 16-bit).
3852263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
3862263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
3872263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+32], ymm5
3882263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
3892263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+32], ymm5
3902263fc984bdc858ee931d3e35c87c404de923950Johann%else
3912263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*2+ 0], ymm5
3922263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*2+ 0], ymm5
3932263fc984bdc858ee931d3e35c87c404de923950Johann%endif
  ; mmsize is used as the number of coefficients consumed per iteration;
  ; the element size is applied by the *2 / *4 scale in the addressing.
3942263fc984bdc858ee931d3e35c87c404de923950Johann  add                        ncoeffq, mmsize
3952263fc984bdc858ee931d3e35c87c404de923950Johann  jnz .ac_only_loop
3962263fc984bdc858ee931d3e35c87c404de923950Johann
3972263fc984bdc858ee931d3e35c87c404de923950Johann  ; Horizontally accumulate/max eobs and write into [eob] memory pointer.
  ; This copy of the epilogue runs when the last iteration took the fast path.
3982263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r2, eobmp
3992263fc984bdc858ee931d3e35c87c404de923950Johann  pshufd                          m7, m8, 0xe              ; m7 words 0..3 = m8 words 4..7
4002263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7                   ; words 0..3 hold the max of 8 lanes pairwise
4012263fc984bdc858ee931d3e35c87c404de923950Johann  pshuflw                         m7, m8, 0xe              ; m7 words 0..1 = m8 words 2..3
4022263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7
4032263fc984bdc858ee931d3e35c87c404de923950Johann  pshuflw                         m7, m8, 0x1              ; m7 word 0 = m8 word 1
4042263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7                   ; word 0 = max over all 8 lanes
4052263fc984bdc858ee931d3e35c87c404de923950Johann  movq                           rax, m8
4062263fc984bdc858ee931d3e35c87c404de923950Johann  mov                           [r2], ax                   ; store the low 16 bits to *eob
4072263fc984bdc858ee931d3e35c87c404de923950Johann  vzeroupper
4082263fc984bdc858ee931d3e35c87c404de923950Johann  RET
4092263fc984bdc858ee931d3e35c87c404de923950Johann
4102263fc984bdc858ee931d3e35c87c404de923950Johann.rest_nonzero:
  ; Slow path of the main loop: at least one of the 16 coefficients passed the
  ; zbin test, so quantize, dequantize and update the eob candidates.
  ; On entry: m6/m11 = abs(c[i]), m9/m10 = signed c[i], m7/m12 = zbin masks.
  ; m1..m4 hold round, quant, dequant and quant-shift per the comments below;
  ; NOTE(review): they are loaded before this section -- confirm against the
  ; preceding setup code.
4112263fc984bdc858ee931d3e35c87c404de923950Johann  paddsw                          m6, m1                   ; m6 += round (signed saturating add)
4122263fc984bdc858ee931d3e35c87c404de923950Johann  paddsw                         m11, m1                   ; m11 += round (signed saturating add)
4132263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
4142263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
4152263fc984bdc858ee931d3e35c87c404de923950Johann  paddw                          m14, m6                   ; m14 += m6
4162263fc984bdc858ee931d3e35c87c404de923950Johann  paddw                          m13, m11                  ; m13 += m11
4172263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
4182263fc984bdc858ee931d3e35c87c404de923950Johann  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
4192263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                         m14, m9                   ; m14 = reinsert sign of c[i]
4202263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                         m13, m10                  ; m13 = reinsert sign of c[i]
4212263fc984bdc858ee931d3e35c87c404de923950Johann  pand                           m14, m7                   ; zero the lanes that failed the zbin test
4222263fc984bdc858ee931d3e35c87c404de923950Johann  pand                           m13, m12                  ; zero the lanes that failed the zbin test
4232263fc984bdc858ee931d3e35c87c404de923950Johann
4242263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
4252263fc984bdc858ee931d3e35c87c404de923950Johann  ; Store 16-bit results as sign-extended 32-bit values in the qcoeff array.
  ; Low four words: pmovsxwd. High four words: interleave with the sign mask
  ; (0 > x) produced by pcmpgtw against the zero register m5.
4262263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m14
4272263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m14, m6
4282263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m14
4292263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+ 0], m11
4302263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+16], m6
4312263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m13
4322263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m13, m6
4332263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m13
4342263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+32], m11
4352263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+48], m6
4362263fc984bdc858ee931d3e35c87c404de923950Johann%else
4372263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*2+ 0], m14
4382263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*2+16], m13
4392263fc984bdc858ee931d3e35c87c404de923950Johann%endif
4402263fc984bdc858ee931d3e35c87c404de923950Johann
  ; Dequantize. The b_32x32 variant multiplies the magnitude, halves it with
  ; a logical right shift, and then restores the sign of the input.
4412263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32
4422263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                          m14, m14
4432263fc984bdc858ee931d3e35c87c404de923950Johann  pabsw                          m13, m13
4442263fc984bdc858ee931d3e35c87c404de923950Johann%endif
4452263fc984bdc858ee931d3e35c87c404de923950Johann  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
4462263fc984bdc858ee931d3e35c87c404de923950Johann  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
4472263fc984bdc858ee931d3e35c87c404de923950Johann%ifidn %1, b_32x32
4482263fc984bdc858ee931d3e35c87c404de923950Johann  psrlw                          m14, 1
4492263fc984bdc858ee931d3e35c87c404de923950Johann  psrlw                          m13, 1
4502263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                         m14, m9
4512263fc984bdc858ee931d3e35c87c404de923950Johann  psignw                         m13, m10
4522263fc984bdc858ee931d3e35c87c404de923950Johann%endif
4532263fc984bdc858ee931d3e35c87c404de923950Johann
4542263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
4552263fc984bdc858ee931d3e35c87c404de923950Johann  ; Store 16-bit results as sign-extended 32-bit values in the dqcoeff array.
4562263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m14
4572263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m14, m6
4582263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m14
4592263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+ 0], m11
4602263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+16], m6
4612263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpgtw                         m6, m5, m13
4622263fc984bdc858ee931d3e35c87c404de923950Johann  punpckhwd                       m6, m13, m6
4632263fc984bdc858ee931d3e35c87c404de923950Johann  pmovsxwd                       m11, m13
4642263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+32], m11
4652263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+48], m6
4662263fc984bdc858ee931d3e35c87c404de923950Johann%else
4672263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*2+ 0], m14
4682263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*2+16], m13
4692263fc984bdc858ee931d3e35c87c404de923950Johann%endif
4702263fc984bdc858ee931d3e35c87c404de923950Johann
  ; eob candidate per lane: iscan[i] + 1 where the lane passed zbin and the
  ; dequantized value is nonzero, otherwise 0. Subtracting the all-ones zbin
  ; mask (-1) is what adds the 1.
4712263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpeqw                        m14, m5                    ; m14 = (dqc[i] == 0) mask
4722263fc984bdc858ee931d3e35c87c404de923950Johann  pcmpeqw                        m13, m5                    ; m13 = (dqc[i] == 0) mask
4732263fc984bdc858ee931d3e35c87c404de923950Johann  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
4742263fc984bdc858ee931d3e35c87c404de923950Johann  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = iscan[i]
4752263fc984bdc858ee931d3e35c87c404de923950Johann  psubw                           m6, m7                    ; m6 = iscan[i] + 1 in lanes that passed zbin
4762263fc984bdc858ee931d3e35c87c404de923950Johann  psubw                          m11, m12                   ; m11 = iscan[i] + 1 in lanes that passed zbin
4772263fc984bdc858ee931d3e35c87c404de923950Johann  pandn                          m14, m6                    ; m14 = candidate where dqc[i] != 0, else 0
4782263fc984bdc858ee931d3e35c87c404de923950Johann  pandn                          m13, m11                   ; m13 = candidate where dqc[i] != 0, else 0
4792263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m14                   ; fold into the running maximum
4802263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m13
4812263fc984bdc858ee931d3e35c87c404de923950Johann  add                        ncoeffq, mmsize
4822263fc984bdc858ee931d3e35c87c404de923950Johann  jnz .ac_only_loop
4832263fc984bdc858ee931d3e35c87c404de923950Johann
4842263fc984bdc858ee931d3e35c87c404de923950Johann  ; Horizontally accumulate/max eobs and write into [eob] memory pointer.
  ; This copy of the epilogue runs when the last iteration took the slow path.
4852263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r2, eobmp
4862263fc984bdc858ee931d3e35c87c404de923950Johann  pshufd                          m7, m8, 0xe              ; m7 words 0..3 = m8 words 4..7
4872263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7
4882263fc984bdc858ee931d3e35c87c404de923950Johann  pshuflw                         m7, m8, 0xe              ; m7 words 0..1 = m8 words 2..3
4892263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7
4902263fc984bdc858ee931d3e35c87c404de923950Johann  pshuflw                         m7, m8, 0x1              ; m7 word 0 = m8 word 1
4912263fc984bdc858ee931d3e35c87c404de923950Johann  pmaxsw                          m8, m7                   ; word 0 = max over all 8 lanes
4922263fc984bdc858ee931d3e35c87c404de923950Johann  movq                           rax, m8
4932263fc984bdc858ee931d3e35c87c404de923950Johann  mov                           [r2], ax                   ; store the low 16 bits to *eob
4942263fc984bdc858ee931d3e35c87c404de923950Johann  vzeroupper
4952263fc984bdc858ee931d3e35c87c404de923950Johann  RET
4962263fc984bdc858ee931d3e35c87c404de923950Johann
4972263fc984bdc858ee931d3e35c87c404de923950Johann  ; Skip-block path (taken when the skip argument is nonzero): write zeros to
  ; every entry of qcoeff and dqcoeff and set *eob to 0.
4982263fc984bdc858ee931d3e35c87c404de923950Johann.blank:
4992263fc984bdc858ee931d3e35c87c404de923950Johann
  ; Restore the full argument-name mapping so the *mp argument references
  ; below resolve, then load only the four values this path needs.
5002263fc984bdc858ee931d3e35c87c404de923950JohannDEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
5012263fc984bdc858ee931d3e35c87c404de923950Johann            qcoeff, dqcoeff, dequant, eob, scan, iscan
5022263fc984bdc858ee931d3e35c87c404de923950Johann
5032263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r0, dqcoeffmp            ; r0 = dqcoeff pointer
5042263fc984bdc858ee931d3e35c87c404de923950Johann  movifnidn                  ncoeffq, ncoeffmp             ; load ncoeff unless it is already in place
5052263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r2, qcoeffmp             ; r2 = qcoeff pointer
5062263fc984bdc858ee931d3e35c87c404de923950Johann  mov                             r3, eobmp                ; r3 = eob pointer
5072263fc984bdc858ee931d3e35c87c404de923950Johann
  ; Rename r0..r3 to match what was just loaded into them.
5082263fc984bdc858ee931d3e35c87c404de923950JohannDEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
5092263fc984bdc858ee931d3e35c87c404de923950Johann
  ; Point both output pointers at the end of their arrays (element size is
  ; 4 bytes with CONFIG_VP9_HIGHBITDEPTH, 2 bytes otherwise) so the loop can
  ; index them with a negative counter that runs up towards zero.
5102263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
5112263fc984bdc858ee931d3e35c87c404de923950Johann  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
5122263fc984bdc858ee931d3e35c87c404de923950Johann  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
5132263fc984bdc858ee931d3e35c87c404de923950Johann%else
5142263fc984bdc858ee931d3e35c87c404de923950Johann  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
5152263fc984bdc858ee931d3e35c87c404de923950Johann  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
5162263fc984bdc858ee931d3e35c87c404de923950Johann%endif
5172263fc984bdc858ee931d3e35c87c404de923950Johann
5182263fc984bdc858ee931d3e35c87c404de923950Johann  neg                        ncoeffq                       ; ncoeffq = -ncoeff
5192263fc984bdc858ee931d3e35c87c404de923950Johann  pxor                            m7, m7                   ; m7 = 0 (source of the zero stores)
5202263fc984bdc858ee931d3e35c87c404de923950Johann
  ; Each iteration clears 16 entries of each array using 32-byte stores from
  ; ymm7. NOTE(review): this relies on the upper half of ymm7 being zero;
  ; presumably guaranteed by the VEX-encoded pxor above -- confirm.
5212263fc984bdc858ee931d3e35c87c404de923950Johann.blank_loop:
5222263fc984bdc858ee931d3e35c87c404de923950Johann%if CONFIG_VP9_HIGHBITDEPTH
5232263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
5242263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*4+32], ymm7
5252263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+ 0], ymm7
5262263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*4+32], ymm7
5272263fc984bdc858ee931d3e35c87c404de923950Johann%else
5282263fc984bdc858ee931d3e35c87c404de923950Johann  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
5292263fc984bdc858ee931d3e35c87c404de923950Johann  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
5302263fc984bdc858ee931d3e35c87c404de923950Johann%endif
5312263fc984bdc858ee931d3e35c87c404de923950Johann  add                        ncoeffq, mmsize               ; advance by the entries cleared per iteration
5322263fc984bdc858ee931d3e35c87c404de923950Johann  jl .blank_loop                                           ; continue while the index is still negative
5332263fc984bdc858ee931d3e35c87c404de923950Johann
5342263fc984bdc858ee931d3e35c87c404de923950Johann  mov                         [eobq], word 0               ; *eob = 0 (16-bit store)
5352263fc984bdc858ee931d3e35c87c404de923950Johann
5362263fc984bdc858ee931d3e35c87c404de923950Johann  vzeroupper
5372263fc984bdc858ee931d3e35c87c404de923950Johann  RET
5382263fc984bdc858ee931d3e35c87c404de923950Johann%endmacro
5392263fc984bdc858ee931d3e35c87c404de923950Johann
; Instantiate the quantizer macro twice, using XMM-width registers with the
; AVX instruction set selected. The first macro argument is the suffix of the
; function name (cglobal quantize_%1) and also selects the b_32x32-specific
; code paths; the second is forwarded to cglobal as its third operand (7).
5402263fc984bdc858ee931d3e35c87c404de923950JohannINIT_XMM avx
5412263fc984bdc858ee931d3e35c87c404de923950JohannQUANTIZE_FN b, 7
5422263fc984bdc858ee931d3e35c87c404de923950JohannQUANTIZE_FN b_32x32, 7
5432263fc984bdc858ee931d3e35c87c404de923950Johann
; NOTE(review): END is not defined in the visible part of this file;
; presumably a project-level end-of-file marker -- confirm.
5442263fc984bdc858ee931d3e35c87c404de923950JohannEND
545