;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)
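;
; Returns the sum of squared differences between coeff and dqcoeff; the sum
; of squared coeff values is written to *ssz.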

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m4, m4                 ; sse accumulator
  pxor      m6, m6                 ; ssz accumulator
  pxor      m5, m5                 ; dedicated zero register
  lea     uqcq, [uqcq+sizeq*2]
  lea     dqcq, [dqcq+sizeq*2]
  neg    sizeq
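  ; point both pointers at the end of the buffers and count sizeq up from
  ; -block_size towards zero, so a single add/jl pair drives the loop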
.loop:
  mova      m2, [uqcq+sizeq*2]
  mova      m0, [dqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
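  ; (pmaddwd squares each 16-bit lane and adds adjacent pairs of products
  ;  into 32-bit lanes)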
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  pmaddwd   m2, m2
  pmaddwd   m3, m3
  ; accumulate in 64bit
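  ; interleave each 32-bit partial sum with the zero register to widen it to
  ; 64 bits before adding, so the running totals stay within 64-bit lanes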
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  punpckldq m7, m1, m5
  paddq     m4, m0
  punpckhdq m1, m5
  paddq     m4, m7
  punpckldq m7, m2, m5
  paddq     m4, m1
  punpckhdq m2, m5
  paddq     m6, m7
  punpckldq m7, m3, m5
  paddq     m6, m2
  punpckhdq m3, m5
  paddq     m6, m7
  paddq     m6, m3
  add    sizeq, mmsize
  jl .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4
  movhlps   m7, m6
  paddq     m4, m5
  paddq     m6, m7
%if ARCH_X86_64
  movq    rax, m4
  movq [sszq], m6
%else
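  ; 32-bit: reload the ssz pointer from its argument slot on the stack and
  ; return the 64-bit result in edx:eax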
  mov     eax, sszm
  pshufd   m5, m4, 0x1
  movq  [eax], m6
  movd    eax, m4
  movd    edx, m5
%endif
  RET

; Compute the sum of squared differences between two int16_t vectors.
; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
;                            intptr_t block_size)

INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
  pxor      m4, m4                 ; sse accumulator
  pxor      m5, m5                 ; dedicated zero register
  lea     uqcq, [uqcq+sizeq*2]
  lea     dqcq, [dqcq+sizeq*2]
  neg    sizeq
.loop:
  mova      m2, [uqcq+sizeq*2]
  mova      m0, [dqcq+sizeq*2]
  mova      m3, [uqcq+sizeq*2+mmsize]
  mova      m1, [dqcq+sizeq*2+mmsize]
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; accumulate in 64bit
  punpckldq m3, m0, m5
  punpckhdq m0, m5
  paddq     m4, m3
  punpckldq m3, m1, m5
  paddq     m4, m0
  punpckhdq m1, m5
  paddq     m4, m3
  paddq     m4, m1
  add    sizeq, mmsize
  jl .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4
  paddq     m4, m5
%if ARCH_X86_64
  movq    rax, m4
%else
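  ; 32-bit: return the 64-bit result in edx:eax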
  pshufd   m5, m4, 0x1
  movd    eax, m4
  movd    edx, m5
%endif
  RET
