;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10
11%include "third_party/x86inc/x86inc.asm"
12%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
13
14SECTION .text
15
16%if ARCH_X86_64
;-----------------------------------------------------------------------
; TRANSPOSE8X8 r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
;
; Transposes an 8x8 matrix of 16-bit words held in eight vector
; registers (one row of eight words per register).
;
;   args 1-8   register numbers of the eight input rows. On exit the
;              same eight names hold the transposed rows in order:
;              arg 1 = out0, arg 2 = out1, ... arg 8 = out7.
;   args 9-10  register numbers of two scratch registers. Their contents
;              are overwritten.
;
; The transpose is done in three interleave stages (words, then dwords,
; then qwords). The results land in a scrambled set of register names,
; and the three SWAPs at the end put them back in order. One of those
; SWAPs involves arg 9, so after the macro that name refers to a
; different register than before; treat both scratch names as holding
; garbage afterwards.
;
; The three-operand punpck forms rely on the instruction wrappers from
; x86inc.asm.
;-----------------------------------------------------------------------
%macro TRANSPOSE8X8 10
  ; stage 1: interleave 16-bit words of adjacent row pairs
  punpcklwd  m%9, m%1, m%2
  punpcklwd  m%10, m%3, m%4
  punpckhwd  m%1, m%2
  punpckhwd  m%3, m%4

  punpcklwd  m%2, m%5, m%6
  punpcklwd  m%4, m%7, m%8
  punpckhwd  m%5, m%6
  punpckhwd  m%7, m%8

  ; stage 2: interleave 32-bit pairs, giving two columns of four rows
  ; per register
  punpckldq  m%6, m%9, m%10
  punpckldq  m%8, m%1, m%3
  punpckhdq  m%9, m%10
  punpckhdq  m%1, m%3

  punpckldq  m%10, m%2, m%4
  punpckldq  m%3, m%5, m%7
  punpckhdq  m%2, m%4
  punpckhdq  m%5, m%7

  ; stage 3: join the upper-four-row and lower-four-row halves of each
  ; column into one full output row
  punpckhqdq  m%4, m%9, m%2  ; out3
  punpcklqdq  m%9, m%2       ; out2
  punpcklqdq  m%7, m%1, m%5  ; out6
  punpckhqdq  m%1, m%5       ; out7

  punpckhqdq  m%2, m%6, m%10 ; out1
  punpcklqdq  m%6, m%10      ; out0
  punpcklqdq  m%5, m%8, m%3  ; out4
  punpckhqdq  m%8, m%3       ; out5

  ; Rename registers so that args 1-8 name out0-out7 in order.
  ; The order of these three SWAPs matters.
  SWAP %6, %1
  SWAP %3, %9
  SWAP %8, %6
%endmacro
56
;-----------------------------------------------------------------------
; HMD8_1D
;
; One 8-point Hadamard butterfly pass over m0-m7, applied independently
; to each of the eight 16-bit lanes of the registers.
;
;   In:      m0-m7
;   Out:     m0-m7 (overwritten with the results)
;   Scratch: m8, m9 (contents overwritten)
;
; Three butterfly stages are performed, pairing registers at distance
; 1, then 2, then 4. In every butterfly the lower-numbered register
; receives the sum and the higher-numbered register receives the
; difference (lower minus higher). Each difference is computed into a
; scratch register first and then renamed into place with SWAP, so the
; sum can still read both original operands.
;
; paddw / psubw are used, so all arithmetic wraps modulo 2^16.
;-----------------------------------------------------------------------
%macro HMD8_1D 0
  ; stage 1: butterflies on (m0,m1) (m2,m3) (m4,m5) (m6,m7)
  psubw              m8, m0, m1
  psubw              m9, m2, m3
  paddw              m0, m1
  paddw              m2, m3
  SWAP               1, 8
  SWAP               3, 9
  psubw              m8, m4, m5
  psubw              m9, m6, m7
  paddw              m4, m5
  paddw              m6, m7
  SWAP               5, 8
  SWAP               7, 9

  ; stage 2: butterflies on (m0,m2) (m1,m3) (m4,m6) (m5,m7)
  psubw              m8, m0, m2
  psubw              m9, m1, m3
  paddw              m0, m2
  paddw              m1, m3
  SWAP               2, 8
  SWAP               3, 9
  psubw              m8, m4, m6
  psubw              m9, m5, m7
  paddw              m4, m6
  paddw              m5, m7
  SWAP               6, 8
  SWAP               7, 9

  ; stage 3: butterflies on (m0,m4) (m1,m5) (m2,m6) (m3,m7)
  psubw              m8, m0, m4
  psubw              m9, m1, m5
  paddw              m0, m4
  paddw              m1, m5
  SWAP               4, 8
  SWAP               5, 9
  psubw              m8, m2, m6
  psubw              m9, m3, m7
  paddw              m2, m6
  paddw              m3, m7
  SWAP               6, 8
  SWAP               7, 9
%endmacro
97
98
;-----------------------------------------------------------------------
; hadamard_8x8(input, stride, output)
;
; 2-D 8x8 Hadamard transform of 16-bit samples: a butterfly pass across
; the eight rows, a transpose, and a second butterfly pass.
;
;   input   pointer to the first row. Each row is read as one 16-byte
;           vector with mova (the aligned move), so every row address
;           must be 16-byte aligned.
;   stride  distance between rows. Consecutive rows are addressed
;           2 * stride bytes apart, i.e. stride counts 16-bit elements.
;           The full-width register is used in the address arithmetic,
;           so the caller must pass a pointer-sized value.
;   output  destination for the 64 coefficients, written through
;           STORE_TRAN_LOW.
;
; cglobal arguments: 3 parameters, 5 general-purpose registers
; (r3 and r4 are scratch), 11 vector registers (m0-m7 data, m8-m10
; scratch for the macros used below).
;
; NOTE(review): the exported C prototype is not visible in this file;
; confirm the parameter types against the declaration used by callers.
;-----------------------------------------------------------------------
INIT_XMM ssse3
cglobal hadamard_8x8, 3, 5, 11, input, stride, output
  ; r3 = byte offset of one row, r4 = byte offset of two rows
  lea                r3, [2 * strideq]
  lea                r4, [4 * strideq]

  ; load the eight rows into m0-m7, two rows per pointer advance
  mova               m0, [inputq]
  mova               m1, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m2, [inputq]
  mova               m3, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m4, [inputq]
  mova               m5, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m6, [inputq]
  mova               m7, [inputq + r3]

  ; butterfly pass, transpose (m9 and m10 as scratch), butterfly pass
  HMD8_1D
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
  HMD8_1D

  ; Write m0-m7 to the output buffer. STORE_TRAN_LOW is defined in
  ; bitdepth_conversion_sse2.asm (not shown here). The third argument
  ; advances by 8 per register, which matches eight coefficients per
  ; register, and the last two arguments look like scratch register
  ; numbers - confirm against the macro definition.
  STORE_TRAN_LOW 0, outputq,  0, 8, 9
  STORE_TRAN_LOW 1, outputq,  8, 8, 9
  STORE_TRAN_LOW 2, outputq, 16, 8, 9
  STORE_TRAN_LOW 3, outputq, 24, 8, 9
  STORE_TRAN_LOW 4, outputq, 32, 8, 9
  STORE_TRAN_LOW 5, outputq, 40, 8, 9
  STORE_TRAN_LOW 6, outputq, 48, 8, 9
  STORE_TRAN_LOW 7, outputq, 56, 8, 9

  RET
130%endif
131