1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
;                            short *diff, unsigned char *Predictor,
;                            int pitch);
;
; Residual of one block, 4 pixels wide by 4 rows tall:
;     diff[row][col] = (short)z[row][col] - (short)Predictor[row][col]
;
;   z          source pixels; consecutive rows are src_stride bytes apart
;   Predictor  prediction pixels; consecutive rows are pitch bytes apart
;   diff       16-bit results; consecutive rows are pitch shorts
;              (pitch*2 bytes) apart
;
; Each row is loaded as 4 bytes, widened to 16-bit words by interleaving
; with zero, subtracted, and stored as 4 words (8 bytes).
;
; The arithmetic is done in xmm0-xmm2 rather than in MMX registers, so the
; x87/MMX state is never touched and the caller does not need an emms
; before running floating-point code.  xmm0-xmm2 are caller-saved in both
; the System V and the Microsoft x64 conventions, so nothing has to be
; preserved for them.
;
; Clobbers: rax, rcx, rdx, xmm0, xmm1, xmm2, flags untouched by the body.
global sym(vp8_subtract_b_sse2_impl)
sym(vp8_subtract_b_sse2_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        mov     rdi,        arg(2) ;diff
        mov     rax,        arg(3) ;Predictor
        mov     rsi,        arg(0) ;z
        movsxd  rdx,        dword ptr arg(1);src_stride
        movsxd  rcx,        dword ptr arg(4);pitch
        pxor    xmm2,       xmm2            ; zero, used to widen bytes to words

        ; row 0
        movd    xmm0,       [rsi]
        movd    xmm1,       [rax]
        punpcklbw   xmm0,   xmm2
        punpcklbw   xmm1,   xmm2
        psubw   xmm0,       xmm1
        movq    MMWORD PTR [rdi],      xmm0

        ; row 1
        movd    xmm0,       [rsi+rdx]
        movd    xmm1,       [rax+rcx]
        punpcklbw   xmm0,   xmm2
        punpcklbw   xmm1,   xmm2
        psubw   xmm0,       xmm1
        movq    MMWORD PTR [rdi+rcx*2], xmm0    ; diff + 1*pitch shorts

        ; row 2
        movd    xmm0,       [rsi+rdx*2]
        movd    xmm1,       [rax+rcx*2]
        punpcklbw   xmm0,   xmm2
        punpcklbw   xmm1,   xmm2
        psubw   xmm0,       xmm1
        movq    MMWORD PTR [rdi+rcx*4], xmm0    ; diff + 2*pitch shorts

        ; row 3: x86 addressing has no *3 or *6 scale, so pre-advance
        lea     rsi,        [rsi+rdx*2]     ; rsi = z + 2*src_stride
        lea     rcx,        [rcx+rcx*2]     ; rcx = 3*pitch

        movd    xmm0,       [rsi+rdx]       ; z + 3*src_stride
        movd    xmm1,       [rax+rcx]       ; Predictor + 3*pitch
        punpcklbw   xmm0,   xmm2
        punpcklbw   xmm1,   xmm2
        psubw   xmm0,       xmm1
        movq    MMWORD PTR [rdi+rcx*2], xmm0    ; diff + 3*pitch shorts

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
72
73
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
;
; Residual of a 16x16 block: diff = src - pred, widened to 16-bit.
;
;   diff    output; 16 shorts (32 bytes) per row, rows stored back to back
;   src     source pixels; consecutive rows are stride bytes apart
;   pred    prediction pixels; 16 bytes per row, rows stored back to back
;   stride  byte distance between source rows
;
; Every vector access is a movdqa, so src, src + stride, pred and diff must
; all be 16-byte aligned.
;
; Widening trick: psubb yields the low byte of each difference.  The high
; byte is 0xFF exactly when pred > src as unsigned bytes; pcmpgtb is a
; signed compare, so both operands are first XORed with 0x80 (t80) to turn
; the unsigned ordering into a signed one.  Interleaving the low bytes with
; that mask produces the sign-extended 16-bit result.
global sym(vp8_subtract_mby_sse2)
sym(vp8_subtract_mby_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

            mov         rdi,            arg(0) ; diff
            mov         rsi,            arg(1) ; src
            mov         rax,            arg(2) ; pred
            movsxd      rdx,            dword ptr arg(3) ; stride

            mov         rcx,            8      ; row pairs left (2 rows per pass)

.submby_loop:
            ; ---- first row of the pair: xmm0-xmm3 ----
            movdqa      xmm0,           XMMWORD PTR [rsi]   ; 16 source pixels
            movdqa      xmm1,           XMMWORD PTR [rax]   ; 16 prediction pixels

            movdqa      xmm2,           xmm0            ; keep src for the compare
            psubb       xmm0,           xmm1            ; low byte of src - pred

            pxor        xmm2,           [GLOBAL(t80)]   ; bias src into signed range
            pxor        xmm1,           [GLOBAL(t80)]   ; bias pred into signed range
            pcmpgtb     xmm1,           xmm2            ; 0xFF where pred > src

            movdqa      xmm3,           xmm1            ; sign mask for the high half
            movdqa      xmm2,           xmm0            ; low bytes for the high half
            punpcklbw   xmm0,           xmm1            ; pixels 0..7  -> 8 words
            punpckhbw   xmm2,           xmm3            ; pixels 8..15 -> 8 words

            movdqa      XMMWORD PTR [rdi],      xmm0
            movdqa      XMMWORD PTR [rdi + 16], xmm2

            ; ---- second row of the pair: xmm4-xmm7 ----
            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
            movdqa      xmm5,           XMMWORD PTR [rax + 16]

            movdqa      xmm6,           xmm4
            psubb       xmm4,           xmm5

            pxor        xmm6,           [GLOBAL(t80)]
            pxor        xmm5,           [GLOBAL(t80)]
            pcmpgtb     xmm5,           xmm6

            movdqa      xmm7,           xmm5
            movdqa      xmm6,           xmm4
            punpcklbw   xmm4,           xmm5
            punpckhbw   xmm6,           xmm7

            movdqa      XMMWORD PTR [rdi + 32], xmm4
            movdqa      XMMWORD PTR [rdi + 48], xmm6

            ; ---- advance all three pointers by two rows ----
            lea         rsi,            [rsi + rdx*2]   ; src  += 2 * stride
            add         rax,            32              ; pred += 2 * 16 bytes
            add         rdi,            64              ; diff += 2 * 16 shorts

            sub         rcx,            1
            jnz         .submby_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
146
147
; SUBTRACT_MBUV_ROWS src_row_a, src_row_b, pred_offset, diff_offset
;
; Subtracts two 8-pixel rows in one pass.
;   %1, %2  address expressions of the two source rows (8 bytes each)
;   %3      byte offset from rax of the 16 matching prediction bytes
;   %4      byte offset from rdi where the 16 result words are stored
;
; psubb gives the low byte of each difference; the high byte is 0xFF
; exactly when pred > src as unsigned bytes.  pcmpgtb compares signed, so
; both sides are XORed with 0x80 (t80) first.
; Overwrites xmm0-xmm3.
%macro SUBTRACT_MBUV_ROWS 4
            movq       xmm0,    MMWORD PTR [%1]          ; first source row
            movq       xmm2,    MMWORD PTR [%2]          ; second source row
            movdqa     xmm1,    XMMWORD PTR [rax + %3]   ; pred for both rows
            punpcklqdq xmm0,    xmm2                     ; rows side by side

            movdqa     xmm2,    xmm0                     ; keep src for the compare
            psubb      xmm0,    xmm1                     ; low byte of src - pred

            pxor       xmm1,    [GLOBAL(t80)]            ; bias into signed range
            pxor       xmm2,    [GLOBAL(t80)]
            pcmpgtb    xmm1,    xmm2                     ; 0xFF where pred > src

            movdqa     xmm2,    xmm0
            movdqa     xmm3,    xmm1
            punpcklbw  xmm0,    xmm1                     ; first row  -> 8 words
            punpckhbw  xmm2,    xmm3                     ; second row -> 8 words

            movdqa     XMMWORD PTR [rdi + %4],      xmm0
            movdqa     XMMWORD PTR [rdi + %4 + 16], xmm2
%endmacro

; SUBTRACT_MBUV_PLANE
;
; Subtracts one 8x8 chroma plane.
;   in:  rsi = source plane, rdx = stride, rcx = 3 * stride,
;        rax = prediction (64 bytes), rdi = destination (64 shorts)
;   out: rsi advanced by 4 * stride; rax, rdi, rdx, rcx unchanged
%macro SUBTRACT_MBUV_PLANE 0
            SUBTRACT_MBUV_ROWS rsi,         rsi+rdx, 0,  0    ; rows 0 1
            SUBTRACT_MBUV_ROWS rsi+rdx*2,   rsi+rcx, 16, 32   ; rows 2 3

            lea        rsi,     [rsi + rdx*4]                 ; step to row 4

            SUBTRACT_MBUV_ROWS rsi,         rsi+rdx, 32, 64   ; rows 4 5
            SUBTRACT_MBUV_ROWS rsi+rdx*2,   rsi+rcx, 48, 96   ; rows 6 7
%endmacro

;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
;
; Residual of the two 8x8 chroma planes: diff = src - pred, widened to
; 16-bit.
;
;   diff    base of the residual buffer; U results go to diff[256..319],
;           V results to diff[320..383]
;   usrc    U source pixels; consecutive rows are stride bytes apart
;   vsrc    V source pixels; consecutive rows are stride bytes apart
;   pred    base of the prediction buffer; U prediction is read from
;           pred[256..319], V prediction from pred[320..383], 8 bytes per row
;   stride  byte distance between source rows (same for U and V)
;
; Prediction loads and result stores use movdqa, so pred and diff must be
; 16-byte aligned.  Source rows are read with 8-byte movq loads.
global sym(vp8_subtract_mbuv_sse2)
sym(vp8_subtract_mbuv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

            mov     rsi,        arg(1)           ; usrc
            mov     rdi,        arg(0)           ; diff
            mov     rax,        arg(3)           ; pred
            movsxd  rdx,        dword ptr arg(4) ; stride
            add     rdi,        256*2            ; diff + 256 shorts
            add     rax,        256              ; pred + 256 bytes
            lea     rcx,        [rdx + rdx*2]    ; 3 * stride

            ; U plane
            SUBTRACT_MBUV_PLANE

            ; V plane
            mov     rsi,        arg(2)           ; vsrc
            add     rdi,        64*2             ; diff + 320 shorts
            add     rax,        64               ; pred + 320 bytes

            SUBTRACT_MBUV_PLANE

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
352
; t80: sixteen 0x80 bytes.  XORing a vector of unsigned bytes with this
; flips every top bit, so the signed pcmpgtb then orders the bytes as if
; they were unsigned.  Kept 16-byte aligned because it is used directly as
; a memory operand of pxor.
SECTION_RODATA
align 16
t80:
    times 4 dd 0x80808080
357