;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)
;
; Returns the sum of (dqcoeff[i] - coeff[i])^2 over the block and stores the
; sum of coeff[i]^2 to *ssz.
;
; Notes derived from the code below:
;   - The loop body runs before the trip count is tested and consumes
;     2*mmsize bytes (16 int16_t values) per pass, so block_size is expected
;     to be a non-zero multiple of 16.
;   - All loads use mova, i.e. both buffers are assumed to be mmsize-aligned.
;   - The difference is formed with a wrapping 16-bit subtract (psubw).
;   - Only three arguments are loaded by the prologue; ssz is reached through
;     sszq on x86-64 and re-read from its stack slot (sszm) on x86-32.
;
; Register roles:
;   m0, m1  dqcoeff -> difference -> pairwise sums of squared differences
;   m2, m3  coeff   -> pairwise sums of squared coefficients
;   m4      64-bit accumulator for the return value (sse)
;   m5      all-zero register used to widen dwords to qwords
;   m6      64-bit accumulator for *ssz
;   m7      scratch
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  ; Point both buffers at their end and walk a negative index up to zero,
  ; so a single add/jl pair drives the loop.
  lea       uqcq, [uqcq+sizeq*2]
  lea       dqcq, [dqcq+sizeq*2]
  neg       sizeq

  pxor      m5, m5                 ; constant zero
  pxor      m4, m4                 ; sse  = 0
  pxor      m6, m6                 ; ssz  = 0

.next_16:
  ; Fetch 16 values from each input.
  mova      m0, [dqcq+sizeq*2]
  mova      m1, [dqcq+sizeq*2+mmsize]
  mova      m2, [uqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]

  ; diff = dqcoeff - coeff (16-bit lanes)
  psubw     m0, m2
  psubw     m1, m3

  ; Square every lane and add adjacent pairs into 32-bit lanes.  With
  ; errors of at most 15 bits + sign each square is 30 bits, so a pair sum
  ; fits in 31 bits (sign bit unused).
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  pmaddwd   m2, m2
  pmaddwd   m3, m3

  ; Widen each 32-bit partial sum to 64 bits by interleaving with zero,
  ; then accumulate.  First the squared differences ...
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  paddq     m4, m0

  punpckldq m7, m1, m5
  punpckhdq m1, m5
  paddq     m4, m7
  paddq     m4, m1

  ; ... then the squared source coefficients.
  punpckldq m7, m2, m5
  punpckhdq m2, m5
  paddq     m6, m7
  paddq     m6, m2

  punpckldq m7, m3, m5
  punpckhdq m3, m5
  paddq     m6, m7
  paddq     m6, m3

  add       sizeq, mmsize          ; advance by 16 int16_t elements
  jl        .next_16

  ; Fold the upper qword of each accumulator onto its lower qword.
  movhlps   m5, m4
  paddq     m4, m5
  movhlps   m7, m6
  paddq     m6, m7

%if ARCH_X86_64
  movq      [sszq], m6             ; *ssz = sum of coeff^2
  movq      rax, m4                ; return value
%else
  pshufd    m5, m4, 0x1            ; bring bits 63:32 of the result to lane 0
  mov       eax, sszm              ; ssz pointer lives on the stack here
  movq      [eax], m6              ; *ssz = sum of coeff^2
  movd      eax, m4                ; 64-bit return value in edx:eax
  movd      edx, m5
%endif
  RET

; Compute the sum of squared difference between two int16_t vectors.
; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
;                            intptr_t block_size)
;
; Returns the sum of (dqcoeff[i] - coeff[i])^2 over the block.
;
; Notes derived from the code below:
;   - The loop body runs before the trip count is tested and consumes
;     2*mmsize bytes (16 int16_t values) per pass, so block_size is expected
;     to be a non-zero multiple of 16.
;   - Every memory access is an aligned SSE2 access, i.e. both buffers are
;     assumed to be mmsize-aligned.
;   - The difference is formed with a wrapping 16-bit subtract (psubw).
;
; Register roles:
;   m0, m1  dqcoeff -> difference -> pairwise sums of squared differences
;   m2, m3  scratch for the widened low halves
;   m4      64-bit accumulator for the return value
;   m5      all-zero register used to widen dwords to qwords

INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
  ; Point both buffers at their end and walk a negative index up to zero,
  ; so a single add/jl pair drives the loop.
  lea       uqcq, [uqcq+sizeq*2]
  lea       dqcq, [dqcq+sizeq*2]
  neg       sizeq

  pxor      m5, m5                 ; constant zero
  pxor      m4, m4                 ; running sum = 0

.next_16:
  ; diff = dqcoeff - coeff for 16 values (16-bit lanes)
  mova      m0, [dqcq+sizeq*2]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, [uqcq+sizeq*2]
  psubw     m1, [uqcq+sizeq*2+mmsize]

  ; Square every lane and add adjacent pairs into 32-bit lanes.  With
  ; errors of at most 15 bits + sign each square is 30 bits, so a pair sum
  ; fits in 31 bits (sign bit unused).
  pmaddwd   m0, m0
  pmaddwd   m1, m1

  ; Widen each 32-bit partial sum to 64 bits by interleaving with zero,
  ; then accumulate.
  punpckldq m2, m0, m5
  punpckhdq m0, m5
  paddq     m4, m2
  paddq     m4, m0

  punpckldq m3, m1, m5
  punpckhdq m1, m5
  paddq     m4, m3
  paddq     m4, m1

  add       sizeq, mmsize          ; advance by 16 int16_t elements
  jl        .next_16

  ; Fold the upper qword of the accumulator onto its lower qword.
  movhlps   m5, m4
  paddq     m4, m5

%if ARCH_X86_64
  movq      rax, m4                ; return value
%else
  pshufd    m5, m4, 0x1            ; bring bits 63:32 of the result to lane 0
  movd      eax, m4                ; 64-bit return value in edx:eax
  movd      edx, m5
%endif
  RET