;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

SECTION .text

; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)
;
; Returns the sum of squared differences between dqcoeff and coeff, and
; writes the sum of squared coeff values to *ssz.
;
; NOTE(review): the prototype above says int16_t, but the loads go through
; LOAD_TRAN_LOW, so the element type is presumably tran_low_t -- confirm
; against the C declaration.
;
; Register roles:
;   uqcq, dqcq - coeff / dqcoeff cursors, advanced by 16 elements per pass
;   sizeq      - elements still to process
;   m4         - 2 x 64-bit partial sums of (dqcoeff - coeff)^2
;   m6         - 2 x 64-bit partial sums of coeff^2
;   m5         - all-zero, used to widen 32-bit sums to 64 bits
;   m0-m3, m7  - scratch
;
; The loop body runs before the first count check, so at least one pass of
; 16 elements is always performed; the loop repeats while the remaining
; count is still greater than zero.

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m5, m5                 ; constant zero for dword -> qword widening
  pxor      m4, m4                 ; error (sse) accumulator
  pxor      m6, m6                 ; coeff energy (ssz) accumulator

.loop:
  ; Load helpers come from bitdepth_conversion_sse2.asm; they presumably
  ; yield 8 int16 words per register (elements 0-7 and 8-15).
  LOAD_TRAN_LOW 2, uqcq, 0
  LOAD_TRAN_LOW 0, dqcq, 0
  LOAD_TRAN_LOW 3, uqcq, 8
  LOAD_TRAN_LOW 1, dqcq, 8
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16

  ; m0/m1 = dqcoeff - coeff, per 16-bit lane
  psubw     m0, m2
  psubw     m1, m3

  ; Each difference is at most 15 bits plus sign, so a square needs 30 bits
  ; and pmaddwd's sum of two squares fits in 31 bits (sign bit unused).
  ; Adding two such 31-bit values still fits in an unsigned 32-bit lane.
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  paddd     m0, m1                 ; 4 x u32 squared-error sums

  pmaddwd   m2, m2
  pmaddwd   m3, m3
  paddd     m2, m3                 ; 4 x u32 squared-coeff sums

  ; Zero-extend the dword sums to qwords and fold them into m4.
  punpckldq m7, m0, m5             ; m7 = low two dwords of m0, widened
  punpckhdq m0, m5                 ; m0 = high two dwords of m0, widened
  paddq     m4, m7
  paddq     m4, m0

  ; Same widening for the coeff energy, folded into m6.
  punpckldq m7, m2, m5
  punpckhdq m2, m5
  paddq     m6, m7
  paddq     m6, m2

  ; None of the SSE2 integer ops above touch EFLAGS or sizeq, so the
  ; countdown can sit directly in front of the branch that consumes it.
  sub    sizeq, 16
  jg .loop

  ; Horizontal reduction: add the upper qword of each accumulator onto its
  ; lower qword. Only the low qword of m4/m6 is read afterwards.
  movhlps   m5, m4
  paddq     m4, m5
  movhlps   m7, m6
  paddq     m6, m7

%if ARCH_X86_64
  movq [sszq], m6                  ; *ssz = sum of coeff^2
  movq    rax, m4                  ; return value
%else
  mov     eax, sszm                ; ssz pointer is fetched from its arg slot
  movq  [eax], m6                  ; *ssz = sum of coeff^2
  pshufd   m5, m4, 0x1             ; dword 1 of m4 -> dword 0 of m5
  movd    eax, m4                  ; low 32 bits of the return value
  movd    edx, m5                  ; high 32 bits of the return value
%endif
  RET

; Compute the sum of squared difference between two tran_low_t vectors.
; Vectors are converted (if necessary) to int16_t for calculations.
; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
;                            intptr_t block_size)
;
; Returns the 64-bit sum of (dqcoeff[i] - coeff[i])^2 in rax on x86-64 or
; in edx:eax on x86-32.
;
; Register roles:
;   uqcq, dqcq - coeff / dqcoeff cursors, advanced by 16 elements per pass
;   sizeq      - elements still to process
;   m4         - 2 x 64-bit partial sums of the squared error
;   m5         - all-zero, used to widen 32-bit sums to 64 bits
;   m0-m3      - scratch
;
; The loop body runs before the first count check, so at least one pass of
; 16 elements is always performed. The loop repeats while the remaining
; count is greater than zero (signed), matching block_size's intptr_t type:
; a count that is not a multiple of 16 ends the loop once it drops below
; zero instead of stepping past zero and running on.

INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
  pxor      m4, m4                 ; sse accumulator
  pxor      m5, m5                 ; dedicated zero register

.loop:
  LOAD_TRAN_LOW 2, uqcq, 0
  LOAD_TRAN_LOW 0, dqcq, 0
  LOAD_TRAN_LOW 3, uqcq, 8
  LOAD_TRAN_LOW 1, dqcq, 8
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16

  ; m0/m1 = dqcoeff - coeff, per 16-bit lane
  psubw     m0, m2
  psubw     m1, m3

  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
  paddd     m0, m1

  ; accumulate in 64bit: zero-extend the four dword sums and add to m4
  punpckldq m3, m0, m5             ; m3 = low two dwords of m0, widened
  punpckhdq m0, m5                 ; m0 = high two dwords of m0, widened
  paddq     m4, m3
  paddq     m4, m0

  ; None of the SSE2 integer ops above touch EFLAGS or sizeq, so the
  ; countdown sits directly in front of the branch that consumes it.
  sub    sizeq, 16
  jg .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4                 ; upper qword of m4 -> lower qword of m5
  paddq     m4, m5
%if ARCH_X86_64
  movq    rax, m4
%else
  pshufd   m5, m4, 0x1             ; dword 1 of m4 -> dword 0 of m5
  movd    eax, m4                  ; low 32 bits of the return value
  movd    edx, m5                  ; high 32 bits of the return value
%endif
  RET