1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10%include "third_party/x86inc/x86inc.asm"
11
12SECTION .text
13
;------------------------------------------------------------------------------
; TRANSFORM_COLS
;
; One butterfly pass of the transform used by fwht4x4, applied independently
; to every 16-bit lane of the registers.
;
; In:    m0 = a, m1 = b, m2 = c, m3 = d
; Out:   m0 = a', m1 = c', m2 = d', m3 = b', where
;          s  = a + b
;          t  = d - c
;          e  = (s - t) >> 1          (arithmetic shift)
;          b' = e - b
;          c' = e - c
;          a' = s - c'
;          d' = t + b'
; Clobb: m4, m5 (scratch; do not rely on their contents afterwards)
;------------------------------------------------------------------------------
%macro TRANSFORM_COLS 0
  psubw           m3,        m2  ; t  = d - c
  paddw           m0,        m1  ; s  = a + b
  movq            m4,        m0
  psubw           m4,        m3
  psraw           m4,        1   ; e  = (s - t) >> 1
  movq            m5,        m4  ; keep a copy of e for b'
  psubw           m4,        m2  ; c' = e - c
  psubw           m0,        m4  ; a' = s - c'
  psubw           m5,        m1  ; b' = e - b
  paddw           m3,        m5  ; d' = t + b'

  ; Assembler-time renaming only (no data is moved): put the results back
  ; into m0..m3 in the order a', c', d', b'.
  SWAP            1,         4   ; m1 <- c'
  SWAP            2, 3, 5        ; chain == SWAP 2,3 then SWAP 3,5:
                                 ; m2 <- d', m3 <- b'
%endmacro
30
;------------------------------------------------------------------------------
; TRANSPOSE_4X4
;
; Transposes a 4x4 block of 16-bit words held one row per register.
; The lane layout described below is for 64-bit (MMX) registers, which is
; how this file instantiates the macro.
;
; In:    m0..m3 = rows 0..3             (rN[k] = word k of row N)
; Out:   m0..m3 = columns 0..3 of the input
; Clobb: m4, m5
;------------------------------------------------------------------------------
%macro TRANSPOSE_4X4 0
  ; Stage 1: interleave the words of rows 0/1, then of rows 2/3.
  movq            m4,        m0
  punpcklwd       m4,        m1  ; m4 = r0[0] r1[0] r0[1] r1[1]
  punpckhwd       m0,        m1  ; m0 = r0[2] r1[2] r0[3] r1[3]
  movq            m5,        m2
  punpcklwd       m5,        m3  ; m5 = r2[0] r3[0] r2[1] r3[1]
  punpckhwd       m2,        m3  ; m2 = r2[2] r3[2] r2[3] r3[3]

  ; Stage 2: interleave dword pairs to finish each column.
  movq            m1,        m4
  punpckldq       m1,        m5  ; m1 = column 0
  punpckhdq       m4,        m5  ; m4 = column 1
  movq            m3,        m0
  punpckldq       m3,        m2  ; m3 = column 2
  punpckhdq       m0,        m2  ; m0 = column 3

  ; Assembler-time renaming so that columns 0..3 end up named m0..m3.
  ; These four pairwise swaps are the expansion of the chain
  ; "SWAP 2, 3, 0, 1, 4".
  SWAP            2,         3
  SWAP            3,         0
  SWAP            0,         1
  SWAP            1,         4
%endmacro
46
;------------------------------------------------------------------------------
; fwht4x4(input, output, stride)
;
; Forward 4x4 transform built from two TRANSFORM_COLS passes, each followed
; by a TRANSPOSE_4X4, with every result finally shifted left by 2.
;
; ABI:   whatever x86inc's cglobal selects for the target (args in r0..r2).
; In:    inputq  = pointer to 4 rows of four 16-bit values; consecutive rows
;                  are strideq*2 bytes apart (stride counts 16-bit elements)
;        outputq = pointer to 32 writable bytes (4 rows x 4 words, packed)
;        strideq = row pitch of the input, in 16-bit elements
;                  (assumed to be a C `int` in the prototype -- confirm
;                  against the declaration used by callers)
; Out:   16 words stored at [outputq .. outputq+31]
; Clobb: r3, m0-m5
; NOTE(review): no emms is issued in this body; presumably callers clear the
;               MMX state before any x87 use -- confirm.
;------------------------------------------------------------------------------
INIT_MMX mmx
cglobal fwht4x4, 3, 4, 8, input, output, stride
  ; A 32-bit int argument leaves the upper half of its 64-bit register
  ; unspecified on x86-64, so sign-extend it before it is used in address
  ; arithmetic. On x86-32 strideq and strided are the same register and this
  ; expands to nothing.
  movsxdifnidn    strideq,   strided

  lea             r3q,       [inputq + strideq*4]    ; r3 = address of row 2
  movq            m0,        [inputq]                ; a1 = row 0
  movq            m1,        [inputq + strideq*2]    ; b1 = row 1
  movq            m2,        [r3q]                   ; c1 = row 2
  movq            m3,        [r3q + strideq*2]       ; d1 = row 3

  ; Two identical passes; the transpose in between makes the second
  ; butterfly operate along the other dimension of the block, and the
  ; final transpose restores the original orientation.
  TRANSFORM_COLS
  TRANSPOSE_4X4
  TRANSFORM_COLS
  TRANSPOSE_4X4

  ; Scale every coefficient by 4.
  psllw           m0,        2
  psllw           m1,        2
  psllw           m2,        2
  psllw           m3,        2

  ; Store the four rows back to back (8 bytes each).
  movq            [outputq],      m0
  movq            [outputq + 8],  m1
  movq            [outputq + 16], m2
  movq            [outputq + 24], m3

  RET
71