;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI helper macros used below (sym, arg, GLOBAL, SHADOW_ARGS_TO_STACK,
; SAVE_XMM, GET_GOT, SECTION_RODATA, ...) are expected to come from this file.
%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;-----------------------------------------------------------------------
;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 butterfly transform on 16-bit samples.
;
; In:   input  - four rows of four 16-bit values. Consecutive rows are
;                `pitch` BYTES apart (pitch is added unscaled to the
;                pointer).
;       output - receives sixteen 16-bit results (32 bytes). It is written
;                with movdqa, so it must be 16-byte aligned.
;       pitch  - read as a 32-bit value and sign-extended.
;
; Pass 1, for each input row ip[0..3], in 16-bit lanes (wrapping):
;       a1 = (ip[0] + ip[2]) * 4        d1 = (ip[1] + ip[3]) * 4
;       b1 = (ip[0] - ip[2]) * 4        c1 = (ip[1] - ip[3]) * 4
;       t[0] = a1 + d1 + (a1 != 0)      t[1] = b1 + c1
;       t[2] = b1 - c1                  t[3] = a1 - d1
;
; Pass 2, for each column of t (t[0], t[4], t[8], t[12] relative to the
; column start), in 32-bit lanes:
;       a1 = t[0] + t[8]                d1 = t[4] + t[12]
;       b1 = t[0] - t[8]                c1 = t[4] - t[12]
;       a2 = a1 + d1   b2 = b1 + c1   c2 = b1 - c1   d2 = a1 - d1
;       result = (x + (x < 0) + 3) >> 3     (arithmetic shift)
;   Results are narrowed to 16 bits with signed saturation and stored as
;       output[0..3]  = a2 of columns 0..3    output[4..7]   = b2
;       output[8..11] = c2                    output[12..15] = d2
;
; Registers: rsi and rdi are pushed/popped here; rdx and xmm0-xmm7 are
; overwritten. SAVE_XMM/RESTORE_XMM, GET_GOT/RESTORE_GOT and the
; SHADOW_ARGS macros are defined in x86_abi_support.asm (not visible in
; this file); presumably they preserve whatever else the target ABI needs.
;
; Lane notation in the comments below: leftmost entry = highest lane.
;-----------------------------------------------------------------------
global sym(vp8_short_walsh4x4_sse2) PRIVATE
sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov     rsi, arg(0)           ; input
    mov     rdi, arg(1)           ; output
    movsxd  rdx, dword ptr arg(2) ; pitch (byte stride between rows)

    ; first for loop
    ; Load the four rows; each movq fetches 4 words and zeroes the upper half.
    movq    xmm0, MMWORD PTR [rsi]           ; row 0
    movq    xmm1, MMWORD PTR [rsi + rdx]     ; row 1
    lea     rsi,  [rsi + rdx*2]
    movq    xmm2, MMWORD PTR [rsi]           ; row 2
    movq    xmm3, MMWORD PTR [rsi + rdx]     ; row 3

    ; Transpose so each 64-bit half holds one input column for all 4 rows.
    punpcklwd xmm0,  xmm1
    punpcklwd xmm2,  xmm3

    movdqa    xmm1, xmm0
    punpckldq xmm0, xmm2           ; ip[1] ip[0]
    punpckhdq xmm1, xmm2           ; ip[3] ip[2]

    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1           ; ip[1]+ip[3]  ip[0]+ip[2]
    psubw     xmm2, xmm1           ; ip[1]-ip[3]  ip[0]-ip[2]

    psllw     xmm0, 2              ; d1  a1
    psllw     xmm2, 2              ; c1  b1

    movdqa    xmm1, xmm0
    punpcklqdq xmm0, xmm2          ; b1  a1
    punpckhqdq xmm1, xmm2          ; c1  d1

    ; Build the (a1 != 0) correction: low half 1 where a1 != 0, high half 0.
    ; movq xmm, xmm clears bits 127:64 of the destination, so xmm6 needs no
    ; separate zeroing beforehand.
    movq      xmm6, xmm0           ; 0   a1
    pxor      xmm7, xmm7
    pcmpeqw   xmm7, xmm6           ; 0xffff where the lane is zero
    paddw     xmm7, [GLOBAL(c1)]   ; 0xffff+1 -> 0, 0+1 -> 1

    movdqa    xmm2, xmm0
    paddw     xmm0, xmm1           ; b1+c1  a1+d1
    psubw     xmm2, xmm1           ; b1-c1  a1-d1
    paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)

    ; second for loop
    ; input: 13  9  5  1 12  8  4  0 (xmm0)
    ;        14 10  6  2 15 11  7  3 (xmm2)
    ; after shuffle:
    ;        13  5  9  1 12  4  8  0 (xmm0)
    ;        14  6 10  2 15  7 11  3 (xmm1)
    ; (numbers are pass-1 result indices, row-major; the shuffle pairs
    ;  rows 0/2 and rows 1/3 in adjacent words for pmaddwd)
    pshuflw   xmm3, xmm0, 0xd8
    pshufhw   xmm0, xmm3, 0xd8
    pshuflw   xmm3, xmm2, 0xd8
    pshufhw   xmm1, xmm3, 0xd8

    ; pmaddwd with (+1,+1) adds each word pair, with (+1,-1) subtracts it,
    ; widening to 32 bits. Trailing digit in the lane names = column.
    movdqa    xmm2, xmm0
    pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
    pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
    movdqa    xmm3, xmm1
    pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
    pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13

    ; Gather like terms; 0x72 also swaps columns 3/2 back into order.
    pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
    pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
    pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
    pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12

    movdqa    xmm0, xmm4
    punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
    punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
    movdqa    xmm1, xmm6
    punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
    punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12

    movdqa    xmm2, xmm0
    paddd     xmm0, xmm4            ; b21 b20 a21 a20
    psubd     xmm2, xmm4            ; c21 c20 d21 d20
    movdqa    xmm3, xmm1
    paddd     xmm1, xmm6            ; b23 b22 a23 a22
    psubd     xmm3, xmm6            ; c23 c22 d23 d22

    ; (x < 0) per lane: 0 > x yields all-ones, masked down to 1.
    pxor      xmm4, xmm4
    movdqa    xmm5, xmm4
    pcmpgtd   xmm4, xmm0
    pcmpgtd   xmm5, xmm2
    pand      xmm4, [GLOBAL(cd1)]
    pand      xmm5, [GLOBAL(cd1)]

    pxor      xmm6, xmm6
    movdqa    xmm7, xmm6
    pcmpgtd   xmm6, xmm1
    pcmpgtd   xmm7, xmm3
    pand      xmm6, [GLOBAL(cd1)]
    pand      xmm7, [GLOBAL(cd1)]

    ; x + (x < 0) + 3
    paddd     xmm0, xmm4
    paddd     xmm2, xmm5
    paddd     xmm0, [GLOBAL(cd3)]
    paddd     xmm2, [GLOBAL(cd3)]
    paddd     xmm1, xmm6
    paddd     xmm3, xmm7
    paddd     xmm1, [GLOBAL(cd3)]
    paddd     xmm3, [GLOBAL(cd3)]

    ; ... >> 3 (arithmetic)
    psrad     xmm0, 3
    psrad     xmm1, 3
    psrad     xmm2, 3
    psrad     xmm3, 3
    movdqa    xmm4, xmm0
    punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
    punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
    movdqa    xmm5, xmm2
    punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
    punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20

    ; Narrow 32 -> 16 bits with signed saturation.
    packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
    packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20

    ; Aligned stores: output must be 16-byte aligned.
    movdqa  XMMWORD PTR [rdi], xmm0
    movdqa  XMMWORD PTR [rdi + 16], xmm2

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Constants read as 128-bit memory operands by vp8_short_walsh4x4_sse2.
; Each one is 16-byte aligned, as SSE memory operands require.
align 16
c1:     ; eight words of +1: pmaddwd pair-sum multiplier, and the +1 addend
    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
align 16
cn1:    ; alternating +1, -1 words: pmaddwd pair-difference multiplier
    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
align 16
cd1:    ; four dwords of 1: masks a compare result down to 0 or 1
    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
align 16
cd3:    ; four dwords of 3: bias added before the arithmetic shift by 3
    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003
