;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

SECTION .text

; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)
;
; Returns the sum of (dqcoeff[i] - coeff[i])^2 over the block and stores the
; sum of coeff[i]^2 in *ssz. Both sums are kept as 64-bit values.
;
; Each pass of the loop consumes 16 elements from each input. The loop body
; always runs at least once and repeats while the remaining count is > 0.
;
; NOTE(review): the inputs are fetched with LOAD_TRAN_LOW, so the pointers are
; presumably tran_low_t * rather than int16_t * as written above - confirm
; against the C declaration.

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m5, m5                 ; constant zero, used to widen 32 -> 64 bit
  pxor      m4, m4                 ; running sse (two 64-bit lanes)
  pxor      m6, m6                 ; running ssz (two 64-bit lanes)

.loop:
  ; m2/m3 <- 16 elements of coeff, m0/m1 <- 16 elements of dqcoeff
  LOAD_TRAN_LOW 2, uqcq, 0
  LOAD_TRAN_LOW 0, dqcq, 0
  LOAD_TRAN_LOW 3, uqcq, 8
  LOAD_TRAN_LOW 1, dqcq, 8
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  ; The flags produced here are the ones tested by the jg at the bottom of
  ; the loop; only SSE2 instructions, which do not write EFLAGS, lie between.
  sub    sizeq, 16

  ; ---- squared error: (dqcoeff - coeff)^2 ----
  psubw     m0, m2
  psubw     m1, m3
  ; each difference is at most 15 bits plus sign, so a square needs 30 bits
  ; and the pairwise sum formed by pmaddwd fits in 31 bits (sign bit unused)
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; adding two 31-bit values still fits in an unsigned 32-bit lane
  paddd     m0, m1
  ; zero-extend the four 32-bit sums to 64 bit and add them to the sse total
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  paddq     m4, m0

  ; ---- squared magnitude: coeff^2 ----
  pmaddwd   m2, m2
  pmaddwd   m3, m3
  paddd     m2, m3
  ; zero-extend the four 32-bit sums to 64 bit and add them to the ssz total
  punpckldq m7, m2, m5
  punpckhdq m2, m5
  paddq     m6, m7
  paddq     m6, m2

  jg .loop

  ; fold the upper 64-bit lane of each accumulator into the lower lane
  movhlps   m5, m4
  paddq     m4, m5
  movhlps   m7, m6
  paddq     m6, m7
%if ARCH_X86_64
  movq    rax, m4                  ; return value: sse
  movq [sszq], m6                  ; *ssz
%else
  mov     eax, sszm                ; fetch the ssz pointer argument
  movq  [eax], m6                  ; *ssz
  pshufd   m5, m4, 0x1             ; bring bits 63:32 of sse down to dword 0
  movd    eax, m4                  ; low half of the 64-bit return value
  movd    edx, m5                  ; high half of the 64-bit return value
%endif
  RET

; Compute the sum of squared difference between two tran_low_t vectors.
; Vectors are converted (if necessary) to int16_t for calculations.
; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
;                            intptr_t block_size)
;
; Returns the 64-bit sum of (dqcoeff[i] - coeff[i])^2 over the block.
;
; Each pass of the loop consumes 16 elements from each input, and the loop
; body always runs at least once. block_size is presumably a positive
; multiple of 16 (TODO confirm against callers); for any other value the
; signed loop test below still terminates once the remaining count is <= 0.

INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
  pxor      m4, m4                 ; sse accumulator
  pxor      m5, m5                 ; dedicated zero register
.loop:
  ; m2/m3 <- 16 elements of coeff, m0/m1 <- 16 elements of dqcoeff
  LOAD_TRAN_LOW 2, uqcq, 0
  LOAD_TRAN_LOW 0, dqcq, 0
  LOAD_TRAN_LOW 3, uqcq, 8
  LOAD_TRAN_LOW 1, dqcq, 8
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  ; The flags produced here are the ones tested by the jg at the bottom of
  ; the loop; only SSE2 instructions, which do not write EFLAGS, lie between.
  sub    sizeq, 16
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
  paddd     m0, m1
  ; accumulate in 64bit (m3 is free to reuse as scratch after the psubw)
  punpckldq m3, m0, m5
  punpckhdq m0, m5
  paddq     m4, m3
  paddq     m4, m0
  ; Signed "greater than" test, the same one block_error uses. A "not zero"
  ; test never exits when block_size is 0 or not a multiple of 16, because
  ; the counter then steps past zero without ever being equal to it.
  jg .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4
  paddq     m4, m5
%if ARCH_X86_64
  movq    rax, m4
%else
  pshufd   m5, m4, 0x1             ; bring bits 63:32 of the sum down to dword 0
  movd    eax, m4                  ; low half of the 64-bit return value
  movd    edx, m5                  ; high half of the 64-bit return value
%endif
  RET