;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Processes 4 rows of 8 pixels. For each row: 8 bytes from s are widened
; to 16-bit words, the matching 8 words from q are added with signed
; saturation, the sums are packed back to bytes with unsigned saturation,
; and the 8 result bytes are stored to d.
;
;   s      - read as 4 consecutive 8-byte rows (byte offsets 0, 8, 16, 24)
;   q      - read as 4 consecutive rows of 8 words (byte offsets 0, 16,
;            32, 48); used directly as a 128-bit paddsw memory operand,
;            so it must be 16-byte aligned
;   d      - rows are written at d, d+stride, d+2*stride, d+3*stride
;   stride - 32-bit signed value, sign-extended before use
;
; Scratch: rax, rdx, xmm0-xmm4. rsi and rdi are pushed and restored.
; sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS come from
; x86_abi_support.asm; presumably they hide the per-ABI symbol naming and
; argument access - confirm against that file.
;-----------------------------------------------------------------------
global sym(vp8_recon2b_sse2)
sym(vp8_recon2b_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(0) ;s
        mov         rdi,        arg(2) ;d
        mov         rdx,        arg(1) ;q
        movsxd      rax,        dword ptr arg(3) ;stride
        pxor        xmm0,       xmm0              ; xmm0 = 0, used for byte->word widening and as pack filler

        ; row 0
        movq        xmm1,       MMWORD PTR [rsi]
        punpcklbw   xmm1,       xmm0              ; zero-extend 8 bytes to 8 words
        paddsw      xmm1,       XMMWORD PTR [rdx]
        packuswb    xmm1,       xmm0              ; clamp words to 0..255; result is in the low 8 bytes
        movq        MMWORD PTR [rdi],   xmm1

        ; row 1
        movq        xmm2,       MMWORD PTR [rsi+8]
        punpcklbw   xmm2,       xmm0
        paddsw      xmm2,       XMMWORD PTR [rdx+16]
        packuswb    xmm2,       xmm0              ; clamp words to 0..255; result is in the low 8 bytes
        movq        MMWORD PTR [rdi+rax],   xmm2

        ; row 2
        movq        xmm3,       MMWORD PTR [rsi+16]
        punpcklbw   xmm3,       xmm0
        paddsw      xmm3,       XMMWORD PTR [rdx+32]
        packuswb    xmm3,       xmm0              ; clamp words to 0..255; result is in the low 8 bytes
        movq        MMWORD PTR [rdi+rax*2], xmm3

        ; row 3: advance rdi by one stride so [rdi+rax*2] addresses d + 3*stride
        add         rdi, rax
        movq        xmm4,       MMWORD PTR [rsi+24]
        punpcklbw   xmm4,       xmm0
        paddsw      xmm4,       XMMWORD PTR [rdx+48]
        packuswb    xmm4,       xmm0              ; clamp words to 0..255; result is in the low 8 bytes
        movq        MMWORD PTR [rdi+rax*2], xmm4

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Processes 4 rows of 16 pixels. For each row: 16 bytes from s are split
; into low/high halves and widened to words, the matching 16 words from q
; are added with signed saturation, and both halves are packed back into
; 16 bytes with unsigned saturation and stored to d.
;
;   s      - read with movdqa at byte offsets 0, 16, 32, 48, so it must
;            be 16-byte aligned
;   q      - read as 128-bit paddsw memory operands at byte offsets
;            0..112 in steps of 16, so it must be 16-byte aligned
;   d      - rows are written with movdqa at d, d+stride, d+2*stride,
;            d+3*stride, so d and stride must keep every row 16-byte
;            aligned
;   stride - 32-bit signed value, sign-extended before use
;
; Scratch: rax, rdx, xmm0-xmm7. rsi and rdi are pushed and restored.
; SAVE_XMM / RESTORE_XMM are macros from x86_abi_support.asm; this routine
; writes xmm6 and xmm7, so presumably they preserve those registers where
; the ABI treats them as callee-saved - confirm against that file.
;-----------------------------------------------------------------------
global sym(vp8_recon4b_sse2)
sym(vp8_recon4b_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(0) ;s
        mov         rdi,        arg(2) ;d
        mov         rdx,        arg(1) ;q
        movsxd      rax,        dword ptr arg(3) ;stride
        pxor        xmm0,       xmm0              ; xmm0 = 0, used for byte->word widening

        ; row 0
        movdqa      xmm1,       XMMWORD PTR [rsi]
        movdqa      xmm5,       xmm1
        punpcklbw   xmm1,       xmm0              ; low 8 bytes -> 8 words
        punpckhbw   xmm5,       xmm0              ; high 8 bytes -> 8 words
        paddsw      xmm1,       XMMWORD PTR [rdx]
        paddsw      xmm5,       XMMWORD PTR [rdx+16]
        packuswb    xmm1,       xmm5              ; clamp both halves to 0..255 and rejoin into 16 bytes
        movdqa      XMMWORD PTR [rdi],  xmm1

        ; row 1
        movdqa      xmm2,       XMMWORD PTR [rsi+16]
        movdqa      xmm6,       xmm2
        punpcklbw   xmm2,       xmm0
        punpckhbw   xmm6,       xmm0
        paddsw      xmm2,       XMMWORD PTR [rdx+32]
        paddsw      xmm6,       XMMWORD PTR [rdx+48]
        packuswb    xmm2,       xmm6              ; clamp both halves to 0..255 and rejoin into 16 bytes
        movdqa      XMMWORD PTR [rdi+rax],  xmm2

        ; row 2
        movdqa      xmm3,       XMMWORD PTR [rsi+32]
        movdqa      xmm7,       xmm3
        punpcklbw   xmm3,       xmm0
        punpckhbw   xmm7,       xmm0
        paddsw      xmm3,       XMMWORD PTR [rdx+64]
        paddsw      xmm7,       XMMWORD PTR [rdx+80]
        packuswb    xmm3,       xmm7              ; clamp both halves to 0..255 and rejoin into 16 bytes
        movdqa      XMMWORD PTR [rdi+rax*2],    xmm3

        ; row 3: advance rdi by one stride so [rdi+rax*2] addresses d + 3*stride
        add       rdi, rax
        movdqa      xmm4,       XMMWORD PTR [rsi+48]
        movdqa      xmm5,       xmm4
        punpcklbw   xmm4,       xmm0
        punpckhbw   xmm5,       xmm0
        paddsw      xmm4,       XMMWORD PTR [rdx+96]
        paddsw      xmm5,       XMMWORD PTR [rdx+112]
        packuswb    xmm4,       xmm5              ; clamp both halves to 0..255 and rejoin into 16 bytes
        movdqa      XMMWORD PTR [rdi+rax*2],    xmm4

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; void vp8_copy_mem16x16_sse2(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies 16 rows of 16 bytes from src to dst. Rows are handled three at a
; time, with loads and stores interleaved; the 16th row is copied on its
; own at the end.
;
;   src        - rows are read with movdqu, so no alignment is required
;   src_stride - 32-bit signed value, sign-extended before use
;   dst        - rows are written with movdqa, so dst and dst_stride must
;                keep every row 16-byte aligned
;   dst_stride - 32-bit signed value, sign-extended before use
;
; Scratch: rax, rcx, xmm0-xmm5. rsi and rdi are pushed and restored.
; The load/store order below is kept exactly as written; it would matter
; if src and dst overlapped.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2)
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(0) ;src;
        movdqu      xmm0,       [rsi]             ; load row 0

        movsxd      rax,        dword ptr arg(1) ;src_stride;
        mov         rdi,        arg(2) ;dst;

        movdqu      xmm1,       [rsi+rax]         ; load row 1
        movdqu      xmm2,       [rsi+rax*2]       ; load row 2

        movsxd      rcx,        dword ptr arg(3) ;dst_stride
        lea         rsi,        [rsi+rax*2]

        movdqa      [rdi],      xmm0              ; store row 0
        add         rsi,        rax               ; rsi -> src row 3

        movdqa      [rdi+rcx],  xmm1              ; store row 1
        movdqa      [rdi+rcx*2],xmm2              ; store row 2

        lea         rdi,        [rdi+rcx*2]
        movdqu      xmm3,       [rsi]             ; load row 3

        add         rdi,        rcx               ; rdi -> dst row 3
        movdqu      xmm4,       [rsi+rax]         ; load row 4

        movdqu      xmm5,       [rsi+rax*2]       ; load row 5
        lea         rsi,        [rsi+rax*2]

        movdqa      [rdi],  xmm3                  ; store row 3
        add         rsi,        rax               ; rsi -> src row 6

        movdqa      [rdi+rcx],  xmm4              ; store row 4
        movdqa      [rdi+rcx*2],xmm5              ; store row 5

        lea         rdi,        [rdi+rcx*2]
        movdqu      xmm0,       [rsi]             ; load row 6

        add         rdi,        rcx               ; rdi -> dst row 6
        movdqu      xmm1,       [rsi+rax]         ; load row 7

        movdqu      xmm2,       [rsi+rax*2]       ; load row 8
        lea         rsi,        [rsi+rax*2]

        movdqa      [rdi],      xmm0              ; store row 6
        add         rsi,        rax               ; rsi -> src row 9

        movdqa      [rdi+rcx],  xmm1              ; store row 7

        movdqa      [rdi+rcx*2],    xmm2          ; store row 8
        movdqu      xmm3,       [rsi]             ; load row 9

        movdqu      xmm4,       [rsi+rax]         ; load row 10
        lea         rdi,        [rdi+rcx*2]

        add         rdi,        rcx               ; rdi -> dst row 9
        movdqu      xmm5,       [rsi+rax*2]       ; load row 11

        lea         rsi,        [rsi+rax*2]
        movdqa      [rdi],  xmm3                  ; store row 9

        add         rsi,        rax               ; rsi -> src row 12
        movdqa      [rdi+rcx],  xmm4              ; store row 10

        movdqa      [rdi+rcx*2],xmm5              ; store row 11
        movdqu      xmm0,       [rsi]             ; load row 12

        lea         rdi,        [rdi+rcx*2]
        movdqu      xmm1,       [rsi+rax]         ; load row 13

        add         rdi,        rcx               ; rdi -> dst row 12
        movdqu      xmm2,       [rsi+rax*2]       ; load row 14

        lea         rsi,        [rsi+rax*2]       ; rsi -> src row 14
        movdqa      [rdi],      xmm0              ; store row 12

        movdqa      [rdi+rcx],  xmm1              ; store row 13
        movdqa      [rdi+rcx*2],xmm2              ; store row 14

        movdqu      xmm3,       [rsi+rax]         ; load row 15 (last row)
        lea         rdi,        [rdi+rcx*2]       ; rdi -> dst row 14

        movdqa      [rdi+rcx],  xmm3              ; store row 15

    ; begin epilog
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
