;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI abstraction layer: supplies sym(), arg(), the prolog/epilog macros and
; the register-name mapping used by the routines in this file.
%include "vpx_ports/x86_abi_support.asm"

; Filter constants; not referenced by the code visible in this file.
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7

;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical post-processing filter, handled in strips of 4 pixel columns.
; For every strip:
;   1. the last image row is replicated into the 8 rows below the image and
;      the first row into the 8 rows above it (8 bytes are written per row);
;   2. a 15-row window slides down the strip while a running sum (mm5, four
;      words) and a running sum of squares (mm6/mm7, four dwords) are kept
;      for each of the 4 columns;
;   3. where 15*sumsq - sum*sum - flimit is negative the pixel is replaced by
;      (pixel + sum + rv) >> 4, rv being four words read from vp8_rv;
;      elsewhere the pixel is kept;
;   4. results pass through the stack buffer d and are written back 8 rows
;      late, so each row is read by the window before it is overwritten.
;
; Stack frame after ALIGN_STACK:
;   [rsp     .. rsp+127]  d, accessed as 16 entries of 4 bytes
;   [rsp+128 .. rsp+135]  flimit2 = flimit replicated into two dwords
;
; The arg(0), arg(2) and arg(3) stack slots are used as working variables.
; NOTE(review): no emms is executed before returning - presumably the caller
; clears the MMX state; confirm.
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx) PRIVATE
sym(vp8_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; row buffer d at [rsp] (16 entries of 4 bytes are used)
    ; create flimit2 at [rsp+128]: flimit in both dwords of one qword
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    ; r8 keeps the table address for the whole function
    lea         r8,       [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;  (the row loop runs 8 extra iterations to flush the delay buffer)
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
            mov         rsi,        arg(0)  ;s
            pxor        mm0,        mm0     ; mm0 = 0, used for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch       ;

            ; this copies the last row down into the border 8 rows
            mov         rdi,        rsi
            ; rows is a 32-bit int: load only the dword and sign-extend it,
            ; the upper half of the argument slot is not guaranteed to be valid
            movsxd      rdx,        dword ptr arg(2)
            sub         rdx,        9                       ; (rows + 8) - 9 = index of last row
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        mm1,        QWORD ptr[rdi]              ; last row
            mov         rcx,        8
.init_borderd:                                                  ; initialize bottom border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      mm1

            dec         rcx
            jne         .init_borderd

            neg         rax                                     ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        mm1,        QWORD ptr[rdi]              ; first row
            mov         rcx,        8
.init_border:                                                   ; initialize top border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      mm1

            dec         rcx
            jne         .init_border


            lea         rsi,        [rsi + rax*8];              ; rsi = s - 8*pitch
            neg         rax                                     ; rax = pitch again


            pxor        mm5,        mm5     ; running sum (4 words)
            pxor        mm6,        mm6     ; running sum of squares, columns 0-1

            pxor        mm7,        mm7     ; running sum of squares, columns 2-3
            mov         rdi,        rsi

            mov         rcx,        15          ; prime the window with 15 rows

.loop_initvar:
            movd        mm1,        DWORD PTR [rdi];
            punpcklbw   mm1,        mm0     ; 4 bytes -> 4 words

            paddw       mm5,        mm1     ; sum += x
            pmullw      mm1,        mm1     ; x*x

            movq        mm2,        mm1     ;
            punpcklwd   mm1,        mm0     ;

            punpckhwd   mm2,        mm0     ;
            paddd       mm6,        mm1     ; sumsq += x*x

            paddd       mm7,        mm2     ;
            lea         rdi,        [rdi+rax]   ;

            dec         rcx
            jne         .loop_initvar
            ; here rdi = s + 7*pitch; rdx becomes the row counter
            xor         rdx,        rdx
.loop_row:
            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]  row leaving the window
            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]  row entering the window

            punpcklbw   mm1,        mm0
            punpcklbw   mm2,        mm0

            paddw       mm5,        mm2                 ; sum += entering - leaving
            psubw       mm5,        mm1

            pmullw      mm2,        mm2
            movq        mm4,        mm2

            punpcklwd   mm2,        mm0
            punpckhwd   mm4,        mm0

            paddd       mm6,        mm2                 ; sumsq += entering^2
            paddd       mm7,        mm4

            pmullw      mm1,        mm1
            movq        mm2,        mm1

            punpcklwd   mm1,        mm0
            psubd       mm6,        mm1                 ; sumsq -= leaving^2

            punpckhwd   mm2,        mm0
            psubd       mm7,        mm2


            movq        mm3,        mm6
            pslld       mm3,        4

            psubd       mm3,        mm6                 ; mm3 = 15 * sumsq (columns 0-1)
            movq        mm1,        mm5

            movq        mm4,        mm5
            pmullw      mm1,        mm1                 ; low words of sum*sum

            pmulhw      mm4,        mm4                 ; high words of sum*sum
            movq        mm2,        mm1

            punpcklwd   mm1,        mm4                 ; 32-bit sum*sum, columns 0-1
            punpckhwd   mm2,        mm4                 ; 32-bit sum*sum, columns 2-3

            movq        mm4,        mm7
            pslld       mm4,        4

            psubd       mm4,        mm7                 ; mm4 = 15 * sumsq (columns 2-3)

            psubd       mm3,        mm1
            psubd       mm4,        mm2

            psubd       mm3,        flimit2
            psubd       mm4,        flimit2

            psrad       mm3,        31                  ; all ones where the result is negative
            psrad       mm4,        31

            packssdw    mm3,        mm4
            packsswb    mm3,        mm0                 ; mm3 = per-pixel byte mask

            movd        mm1,        DWORD PTR [rsi+rax*8] ; current row

            movq        mm2,        mm1                 ; keep the unfiltered pixels
            punpcklbw   mm1,        mm0

            paddw       mm1,        mm5                 ; pixel + sum
            mov         rcx,        rdx

            and         rcx,        127                 ; table position = row & 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp8_rv))]
            movq        mm4,        [rax + rcx*2] ; 4 words at vp8_rv + rcx*2 bytes
            pop         rax
%elif ABI_IS_32BIT=0
            movq        mm4,        [r8 + rcx*2] ; 4 words at vp8_rv + rcx*2 bytes
%else
            movq        mm4,        [sym(vp8_rv) + rcx*2]
%endif
            paddw       mm1,        mm4
            psraw       mm1,        4                   ; (pixel + sum + rv) >> 4

            packuswb    mm1,        mm0
            pand        mm1,        mm3                 ; filtered value where the mask is set

            pandn       mm3,        mm2                 ; original value elsewhere
            por         mm1,        mm3

            and         rcx,        15
            movd        DWORD PTR   [rsp+rcx*4], mm1 ; d[row & 15] = result for this row

            cmp         edx,        8
            jl          .skip_assignment

            ; write back the result computed 8 iterations ago; rsi points at that row
            mov         rcx,        rdx
            sub         rcx,        8
            and         rcx,        15
            movd        mm1,        DWORD PTR [rsp+rcx*4] ; d[(row - 8) & 15]
            movd        [rsi],      mm1

.skip_assignment:
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows (already incremented by 8)
            jl          .loop_row


        ; s += 4, done at full pointer width so a carry out of the low
        ; 32 bits is not lost (rsi is reloaded at .loop_col)
        mov         rsi,        arg(0)
        add         rsi,        4
        mov         arg(0),     rsi
        sub         dword arg(3), 4 ; cols -= 4
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 136
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2


;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; Adds noise to an image plane one row at a time, 8 bytes per step.
; For every row the noise is read starting at noise + (rand() & 0xff).
; Each group of 8 pixels is clamped with saturating arithmetic
; (- blackclamp, + bothclamp, - whiteclamp) before the noise bytes are added
; with wrapping byte addition and the result is stored back in place.
;
; The three clamp vectors are addressed as blackclamp+0, +16 and +32, i.e.
; the code relies on them being stored contiguously; only the first 8 bytes
; of each are read.
; The arg(0) and arg(6) stack slots serve as the running row pointer and the
; remaining-row counter. Both loops test their condition after the body, so
; at least one row and one 8-byte group are always processed.
; NOTE(review): no extern for LIBVPX_RAND is visible in this file - presumably
; it comes from the included ABI header; confirm.
global sym(vp8_plane_add_noise_mmx) PRIVATE
sym(vp8_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.next_row:
    ; pick this row's starting position inside the noise buffer
    call        sym(LIBVPX_RAND) WRT_PLT
    and         rax, 0xff
    mov         rdi, arg(1)                 ; noise
    add         rdi, rax                    ; rdi = noise + (rand() & 0xff)

    ; everything below is (re)loaded after the call, which may have
    ; clobbered the volatile registers
    mov         rsi, arg(0)                 ; rsi = current row
    mov         rdx, arg(2)                 ; rdx = blackclamp (base of the 3 vectors)
    movsxd      rcx, dword arg(5)           ; rcx = Width
    xor         rax, rax                    ; rax = byte offset within the row

.next_qword:
            movq        mm1, [rsi+rax]      ; 8 source pixels

            ; clamp both ends so that adding the noise cannot leave the range
            psubusb     mm1, [rdx]          ; blackclamp
            paddusb     mm1, [rdx+32]       ; bothclamp
            psubusb     mm1, [rdx+16]       ; whiteclamp

            movq        mm2, [rdi+rax]      ; 8 noise bytes for this position
            paddb       mm1, mm2
            movq        [rsi+rax], mm1      ; store in place

            add         rax, 8              ; advance to the next 8 pixels

            cmp         rax, rcx
            jl          .next_qword

    movsxd      rax, dword arg(7)           ; Pitch
    add         arg(0), rax                 ; Start += Pitch
    sub         dword arg(6), 1             ; Height -= 1
    jg          .next_row

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16

; Blur: 48 words laid out as 16 x 16, 8 x 64, 16 x 16, 8 x 0.
; Not referenced by the code visible in this file.
Blur:
    times 16 dw 16
    times  8 dw 64
    times 16 dw 16
    times  8 dw  0

; rd: four words of 64 (0x40). Not referenced by the code visible in this file.
; NOTE(review): presumably a rounding constant - confirm against its users.
rd:
    times  4 dw 64
