1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%define private_prefix vp9
12
13%include "third_party/x86inc/x86inc.asm"
14%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
15
16SECTION .text
17
; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
;                         int64_t *ssz)
;
; Returns the sum of squared differences (dqcoeff[i] - coeff[i])^2 and writes
; the sum of squared coeff[i] values to *ssz. Elements are consumed 16 at a
; time and the loop body always runs at least once.
; NOTE(review): both inputs are read through LOAD_TRAN_LOW, so the element type
; is presumably tran_low_t rather than int16_t -- confirm against the C
; prototype.
;
; Register roles:
;   m0, m1 : dqcoeff lanes, then error lanes, then squared-error sums
;   m2, m3 : coeff lanes, then squared-coeff sums
;   m4     : 64-bit accumulator for the squared error (return value)
;   m5     : all-zero register used to widen 32-bit sums to 64 bit
;   m6     : 64-bit accumulator for the squared coefficients (*ssz)
;   m7     : scratch

INIT_XMM sse2
cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m5, m5                 ; constant zero for the 32 -> 64 bit widening
  pxor      m4, m4                 ; squared-error accumulator
  pxor      m6, m6                 ; squared-coefficient accumulator
.loop:
  ; Fetch 16 elements of each input and step both pointers past them.
  LOAD_TRAN_LOW 2, uqcq, 0
  LOAD_TRAN_LOW 0, dqcq, 0
  LOAD_TRAN_LOW 3, uqcq, 8
  LOAD_TRAN_LOW 1, dqcq, 8
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16

  ; m0/m1 = dqcoeff - coeff in 16-bit lanes. The original author's note
  ; assumes each difference needs at most 15 bits plus sign, so a square is
  ; 30 bits and the pairwise sum produced by pmaddwd fits in 31 bits.
  psubw     m0, m2
  psubw     m1, m3
  pmaddwd   m0, m0                 ; squared errors, adjacent pairs summed
  pmaddwd   m1, m1
  paddd     m0, m1                 ; two 31-bit sums fit in an unsigned dword
  pmaddwd   m2, m2                 ; squared coefficients, same scheme
  pmaddwd   m3, m3
  paddd     m2, m3

  ; Zero-extend the four dword sums to qwords and fold them into m4 (error).
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7
  paddq     m4, m0

  ; Same widening for the coefficient energy, folded into m6.
  punpckldq m7, m2, m5
  punpckhdq m2, m5
  paddq     m6, m7
  paddq     m6, m2

  ; Count down the remaining elements; keep going while the count is > 0.
  sub       sizeq, 16
  jg .loop

  ; Fold the upper qword of each accumulator onto its lower qword. Only the
  ; low qword of m4 and m6 is consumed below.
  movhlps   m5, m4
  paddq     m4, m5
  movhlps   m7, m6
  paddq     m6, m7
%if ARCH_X86_64
  movq [sszq], m6                  ; *ssz = sum of squared coefficients
  movq    rax, m4                  ; return value = sum of squared errors
%else
  pshufd   m5, m4, 0x1             ; bring bits 63:32 of the result into dword 0
  mov     eax, sszm                ; fetch the ssz pointer argument
  movq  [eax], m6                  ; *ssz = sum of squared coefficients
  movd    eax, m4                  ; 64-bit return value in edx:eax
  movd    edx, m5
%endif
  RET
72
; Compute the sum of squared difference between two tran_low_t vectors.
; Vectors are converted (if necessary) to int16_t for calculations.
; int64_t vp9_block_error_fp(tran_low_t *coeff, tran_low_t *dqcoeff,
;                            intptr_t block_size)
;
; Elements are consumed 16 at a time and the loop body always runs at least
; once, so block_size is expected to be a positive multiple of 16. The loop
; ends as soon as the remaining count is <= 0 (signed), matching block_error,
; so an unexpected size cannot keep the loop running until the counter wraps.
;
; Register roles:
;   m0, m1 : dqcoeff lanes, then error lanes, then squared-error sums
;   m2, m3 : coeff lanes (m3 is reused as scratch once the error is formed)
;   m4     : 64-bit accumulator for the squared error (return value)
;   m5     : all-zero register used to widen 32-bit sums to 64 bit

INIT_XMM sse2
cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
  pxor      m4, m4                 ; sse accumulator
  pxor      m5, m5                 ; dedicated zero register
.loop:
  ; Fetch 16 elements of each input and step both pointers past them.
  LOAD_TRAN_LOW 2, uqcq, 0
  LOAD_TRAN_LOW 0, dqcq, 0
  LOAD_TRAN_LOW 3, uqcq, 8
  LOAD_TRAN_LOW 1, dqcq, 8
  INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16
  INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16
  ; m0/m1 = dqcoeff - coeff in 16-bit lanes
  psubw     m0, m2
  psubw     m1, m3
  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
  pmaddwd   m0, m0
  pmaddwd   m1, m1
  ; the sum of 2 31bit integers will fit in a 32bit unsigned integer
  paddd     m0, m1
  ; accumulate in 64bit: zero-extend the four dword sums and add them to m4
  punpckldq m3, m0, m5
  punpckhdq m0, m5
  paddq     m4, m3
  paddq     m4, m0
  ; Count down the remaining elements right before the branch so the flag
  ; dependency is explicit. Use a signed "greater than" test rather than
  ; "not zero": a count that steps past zero must still end the loop.
  sub       sizeq, 16
  jg .loop

  ; accumulate horizontally and store in return value
  movhlps   m5, m4                 ; m5 low qword = m4 high qword
  paddq     m4, m5                 ; low qword of m4 now holds the total
%if ARCH_X86_64
  movq    rax, m4
%else
  pshufd   m5, m4, 0x1             ; bring bits 63:32 of the result into dword 0
  movd    eax, m4                  ; 64-bit return value in edx:eax
  movd    edx, m5
%endif
  RET
116