;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Prefix applied by cglobal to the symbols defined in this file
; (e.g. block_error -> vp9_block_error).
%define private_prefix vp9

; x86inc supplies cglobal / INIT_XMM / RET and the mN / rN register aliases.
%include "third_party/x86inc/x86inc.asm"
; Expected to supply LOAD_TRAN_LOW and INCREMENT_ELEMENTS_TRAN_LOW, which the
; routines below use to read coefficient buffers.
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

SECTION .text

; int64_t vp9_block_error(tran_low_t *coeff, tran_low_t *dqcoeff,
;                         intptr_t block_size, int64_t *ssz)
;
; Returns the sum of squared differences between dqcoeff and coeff, and
; stores the sum of squares of coeff to *ssz.
;
; Both buffers are read through LOAD_TRAN_LOW and then processed as packed
; 16-bit words, 16 elements per loop iteration.  The loop is a do-while: one
; iteration always runs before block_size is tested, and it repeats while the
; remaining count is greater than zero.
;
; Register use:
;   m0, m1  dqcoeff words, then differences, then squared differences
;   m2, m3  coeff words, then squared coeff
;   m4      64-bit sse accumulator (two lanes)
;   m5      all-zero register inside the loop, scratch afterwards
;   m6      64-bit ssz accumulator (two lanes)
;   m7      scratch

INIT_XMM sse2
; Only the first three arguments are loaded into registers; ssz is reached as
; sszq on x86-64 and through its stack slot (sszm) on x86-32.
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m4, m4                 ; sse accumulator
  pxor      m6, m6                 ; ssz accumulator
  pxor      m5, m5                 ; dedicated zero register
.loop:
  LOAD_TRAN_LOW 2, uqcq, 0         ; m2 = coeff[0..7]
  LOAD_TRAN_LOW 0, dqcq, 0         ; m0 = dqcoeff[0..7]
  LOAD_TRAN_LOW 3, uqcq, 8         ; m3 = coeff[8..15]
  LOAD_TRAN_LOW 1, dqcq, 8         ; m1 = dqcoeff[8..15]
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  ; The flags set here are consumed by the jg at the bottom of the loop; the
  ; SSE instructions in between do not modify EFLAGS.
  sub    sizeq, 16
  psubw     m0, m2                 ; dqcoeff - coeff
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0                 ; pairwise sums of squared differences
  pmaddwd   m1, m1
  pmaddwd   m2, m2                 ; pairwise sums of squared coeff
  pmaddwd   m3, m3
  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
  paddd     m0, m1
  paddd     m2, m3
  ; accumulate in 64bit
  ; Interleaving with the zero register zero-extends each dword to a qword,
  ; so the 32-bit partial sums are treated as unsigned.
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  punpckldq m7, m2, m5
  paddq     m4, m0
  punpckhdq m2, m5
  paddq     m6, m7
  paddq     m6, m2
  jg .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4                 ; m5 low qword = m4 high qword
  movhlps   m7, m6
  paddq     m4, m5                 ; low qword of m4 = total sse
  paddq     m6, m7                 ; low qword of m6 = total ssz
%if ARCH_X86_64
  movq    rax, m4                  ; 64-bit return value in rax
  movq [sszq], m6
%else
  mov     eax, sszm                ; fetch the ssz pointer from the stack
  pshufd   m5, m4, 0x1             ; dword 0 of m5 = upper half of the result
  movq  [eax], m6
  movd    eax, m4                  ; 64-bit return value in edx:eax
  movd    edx, m5
%endif
  RET

; Compute the sum of squared difference between two tran_low_t vectors.
; Vectors are converted (if necessary) to int16_t for calculations.
; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
;                            intptr_t block_size)
;
; Processes 16 elements per loop iteration.  The loop is a do-while: one
; iteration always runs before block_size is tested, and it repeats while the
; remaining count is greater than zero.
;
; Register use:
;   m0, m1  dqcoeff words, then differences, then squared differences
;   m2      coeff[0..7]
;   m3      coeff[8..15], reused as scratch after the subtraction
;   m4      64-bit sse accumulator (two lanes)
;   m5      all-zero register inside the loop, scratch afterwards

INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
  pxor      m4, m4                 ; sse accumulator
  pxor      m5, m5                 ; dedicated zero register
.loop:
  LOAD_TRAN_LOW 2, uqcq, 0         ; m2 = coeff[0..7]
  LOAD_TRAN_LOW 0, dqcq, 0         ; m0 = dqcoeff[0..7]
  LOAD_TRAN_LOW 3, uqcq, 8         ; m3 = coeff[8..15]
  LOAD_TRAN_LOW 1, dqcq, 8         ; m1 = dqcoeff[8..15]
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  ; The flags set here are consumed by the jg at the bottom of the loop; the
  ; SSE instructions in between do not modify EFLAGS.
  sub    sizeq, 16
  psubw     m0, m2                 ; dqcoeff - coeff
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0                 ; pairwise sums of squared differences
  pmaddwd   m1, m1
  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
  paddd     m0, m1
  ; accumulate in 64bit
  ; Interleaving with the zero register zero-extends each dword to a qword,
  ; so the 32-bit partial sums are treated as unsigned.
  punpckldq m3, m0, m5
  punpckhdq m0, m5
  paddq     m4, m3
  paddq     m4, m0
  ; Loop while the remaining count is positive, matching block_error.  A
  ; "not zero" test would never terminate if block_size were zero, negative
  ; or not a multiple of 16.
  jg .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4                 ; m5 low qword = m4 high qword
  paddq     m4, m5                 ; low qword of m4 = total sse
%if ARCH_X86_64
  movq    rax, m4                  ; 64-bit return value in rax
%else
  pshufd   m5, m4, 0x1             ; dword 0 of m5 = upper half of the result
  movd    eax, m4                  ; 64-bit return value in edx:eax
  movd    edx, m5
%endif
  RET