1;
2;  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%define private_prefix vp9
12
13%include "third_party/x86inc/x86inc.asm"
14%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
15
16SECTION .text
17
;-----------------------------------------------------------------------
; TRANSFORM_COLS
; One 1-D butterfly pass over packed 16-bit lanes, as used by fwht4x4
; (the name suggests a forward Walsh-Hadamard transform).
;
; In:    m0 = a1, m1 = b1, m2 = c1, m3 = d1
; Out:   m0 = a1', m1 = c1', m2 = d1', m3 = b1'
;        (the outputs are put in place by SWAP register renaming,
;         not by extra move instructions)
; Clobb: m4, m5
;
; Per lane:
;   a1 += b1
;   d1 -= c1
;   e1  = (a1 - d1) >> 1          ; arithmetic shift
;   b1' = e1 - b1
;   c1' = e1 - c1
;   a1' = a1 - c1'
;   d1' = d1 + b1'
;-----------------------------------------------------------------------
%macro TRANSFORM_COLS 0
  psubw           m3,        m2 ; d1 -= c1
  paddw           m0,        m1 ; a1 += b1
  movq            m4,        m0 ; low 64 bits of a1 -> m4
  psubw           m4,        m3 ; a1 - d1
  psraw           m4,        1  ; e1 = (a1 - d1) >> 1
  movq            m5,        m4 ; second copy of e1
  psubw           m4,        m2 ; c1' = e1 - c1
  psubw           m5,        m1 ; b1' = e1 - b1
  paddw           m3,        m5 ; d1' = d1 + b1'
  psubw           m0,        m4 ; a1' = a1 - c1'

  ; Rename registers so the results sit in m0..m3 in output order.
                                ; m0 = a1' (already in place)
  SWAP            1,         4  ; m1 = c1'
  SWAP            2,         3  ; m2 = d1'
  SWAP            3,         5  ; m3 = b1'
%endmacro
34
;-----------------------------------------------------------------------
; TRANSPOSE_4X4
; Transposes a 4x4 block of 16-bit values held one row per register.
;
; In:    m0..m3 = rows 0..3, four words each in the low 64 bits
; Out:   m0 = column 0 followed by column 1
;        m1 = column 2 followed by column 3
; Clobb: m2 (left holding the interleaved rows 2 and 3); m3 is only read
;
; Element notation below is RC (row digit, column digit).
;-----------------------------------------------------------------------
%macro TRANSPOSE_4X4 0
                                ; on entry:
                                ;   m0 = 00 01 02 03
                                ;   m1 = 10 11 12 13
                                ;   m2 = 20 21 22 23
                                ;   m3 = 30 31 32 33
  ; Step 1: interleave words of adjacent row pairs.
  punpcklwd       m2,        m3 ; m2 = 20 30 21 31  22 32 23 33
  punpcklwd       m0,        m1 ; m0 = 00 10 01 11  02 12 03 13

  ; Step 2: interleave dwords of the two row pairs. Both the low and the
  ; high unpack need the step-1 value of m0, so copy it first.
  mova            m1,        m0
  punpckhdq       m1,        m2 ; m1 = 02 12 22 32  03 13 23 33
  punpckldq       m0,        m2 ; m0 = 00 10 20 30  01 11 21 31
%endmacro
46
;-----------------------------------------------------------------------
; fwht4x4(input, output, stride)        SSE2
; Exported through x86inc's cglobal, which applies private_prefix.
;
; In:    input  = pointer to a 4x4 block of 16-bit values; row r is read
;                 from input + r * stride * 2 bytes (four words per row)
;        output = destination handed to STORE_TRAN_LOW
;        stride = row pitch of input, in 16-bit elements
; Out:   16 results written through STORE_TRAN_LOW
; Regs:  r3 = scratch pointer to row 2; m0..m5 used as vector registers
;
; Flow:  TRANSFORM_COLS, TRANSPOSE_4X4, TRANSFORM_COLS, TRANSPOSE_4X4,
;        then every result is shifted left by 2 (multiplied by 4).
;
; NOTE(review): only m0..m5 are named in this file; the declared XMM
; count of 8 is kept as it was so the generated prologue is unchanged.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal fwht4x4, 3, 4, 8, input, output, stride
  ; stride is a 32-bit int in the C prototype (per the libvpx rtcd
  ; declaration; re-check if that prototype changes). On x86-64 the upper
  ; half of its argument register is unspecified, so widen it before it
  ; takes part in 64-bit address arithmetic. Expands to nothing on x86-32.
  movsxdifnidn    strideq,   strided

  ; Load the four input rows, four words each.
  lea             r3q,       [inputq + strideq*4] ; r3 -> row 2
  movq            m0,        [inputq]             ; a1 = row 0
  movq            m1,        [inputq + strideq*2] ; b1 = row 1
  movq            m2,        [r3q]                ; c1 = row 2
  movq            m3,        [r3q + strideq*2]    ; d1 = row 3

  ; First pass, then transpose: m0 = col0|col1, m1 = col2|col3.
  TRANSFORM_COLS
  TRANSPOSE_4X4

  ; Spread the two packed registers back out to one vector per register
  ; in m0..m3, the layout TRANSFORM_COLS expects.
  SWAP            1,         2     ; m2 = col2|col3, m1 is now free
  psrldq          m1,        m0, 8 ; m1 = col1 (upper half of m0)
  psrldq          m3,        m2, 8 ; m3 = col3 (upper half of m2)

  ; Second pass and transpose back; results end up packed in m0 and m1.
  TRANSFORM_COLS
  TRANSPOSE_4X4

  ; Scale every result by 4.
  psllw           m0,        2
  psllw           m1,        2

  ; Store eight results from m0 at element offset 0 and eight from m1 at
  ; element offset 8. STORE_TRAN_LOW is defined outside this file;
  ; presumably 2 and 3 name the scratch registers it may use - confirm
  ; against its definition.
  STORE_TRAN_LOW 0, outputq, 0, 2, 3
  STORE_TRAN_LOW 1, outputq, 8, 2, 3

  RET
70