1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14%define VP8_FILTER_WEIGHT 128
15%define VP8_FILTER_SHIFT  7
16
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical ("down") post-processing filter, handled in strips of 4 columns.
;
; For every strip the routine
;   1. replicates 8 bytes of the last image row into the 8 rows below the
;      image and 8 bytes of the first row into the 8 rows above it (so it
;      WRITES outside rows [0, rows) of dst),
;   2. primes a 15-row window (rows -8 .. +6) with per-column sums and sums
;      of squares,
;   3. slides that window down one row at a time; for each column where
;          15 * sum_of_squares - sum * sum - flimit  <  0
;      the pixel is replaced by (pixel + sum + vp8_rv[row & 127 ...]) >> 4,
;      otherwise it is kept unchanged.
; Results are delayed through a 16-entry ring buffer on the stack so that a
; row is only overwritten 8 iterations after it was computed, i.e. once it
; no longer enters the window as the "current" pixel.
;
; Register roles inside .loop_row:
;   rax = pitch            rdx = row counter (0 .. rows+8-1)
;   rsi = &s[(row-8)*pitch]  (row leaving the window / row being written)
;   rdi = &s[(row+7)*pitch]  (row entering the window)
;   rcx = scratch index    r8  = &vp8_rv (64-bit builds only)
;   mm0 = 0                mm5 = 4 x 16-bit column sums
;   mm6 / mm7 = 32-bit sums of squares for columns 0-1 / 2-3
;
; Arguments are accessed through the arg(n) stack shadow set up by
; SHADOW_ARGS_TO_STACK; rows, cols and dst are modified in that shadow.
; No EMMS is executed here (presumably the caller resets MMX state -
; TODO confirm against callers).
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx) PRIVATE
sym(vp8_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; ring buffer of 16 four-byte output rows at [rsp] (indexed rsp+rcx*4)
    ; create flimit2 (flimit replicated into both dwords) at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    ; 64-bit builds keep the table base in r8 for the whole function
    lea         r8,       [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;   (loop bound covers the 8-row output delay)
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
            mov         rsi,        arg(0)  ;s
            pxor        mm0,        mm0     ; mm0 = 0, used for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch

            ; this copies the last row down into the border 8 rows
            mov         rdi,        rsi
            ; rows is an int: load exactly 32 bits and sign-extend instead of
            ; reading the whole shadow slot, whose upper half is not defined
            movsxd      rdx,        dword ptr arg(2)
            sub         rdx,        9                           ; (rows+8)-9 = last row index
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        mm1,        QWORD ptr[rdi]              ; last row
            mov         rcx,        8
.init_borderd:                                                  ; initialize bottom border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      mm1

            dec         rcx
            jne         .init_borderd

            neg         rax                                     ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        mm1,        QWORD ptr[rdi]              ; first row
            mov         rcx,        8
.init_border:                                                   ; initialize top border
            lea         rdi,        [rdi + rax]
            movq        [rdi],      mm1

            dec         rcx
            jne         .init_border


            lea         rsi,        [rsi + rax*8]               ; rsi = &s[-pitch*8]
            neg         rax                                     ; rax = +pitch again


            pxor        mm5,        mm5     ; column sums
            pxor        mm6,        mm6     ; sums of squares, columns 0-1

            pxor        mm7,        mm7     ; sums of squares, columns 2-3
            mov         rdi,        rsi

            mov         rcx,        15      ; prime the window with 15 rows

.loop_initvar:
            movd        mm1,        DWORD PTR [rdi]
            punpcklbw   mm1,        mm0     ; 4 pixels -> 4 words

            paddw       mm5,        mm1     ; sum += x
            pmullw      mm1,        mm1     ; x*x (fits in 16 unsigned bits)

            movq        mm2,        mm1
            punpcklwd   mm1,        mm0     ; widen squares to 32 bits

            punpckhwd   mm2,        mm0
            paddd       mm6,        mm1

            paddd       mm7,        mm2
            lea         rdi,        [rdi+rax]   ; next row

            dec         rcx
            jne         .loop_initvar
            ; rdi now points at row +7; start the row counter at 0
            xor         rdx,        rdx
.loop_row:
            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]  row leaving
            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]  row entering

            punpcklbw   mm1,        mm0
            punpcklbw   mm2,        mm0

            ; sum += entering - leaving
            paddw       mm5,        mm2
            psubw       mm5,        mm1

            ; sumsq += entering^2
            pmullw      mm2,        mm2
            movq        mm4,        mm2

            punpcklwd   mm2,        mm0
            punpckhwd   mm4,        mm0

            paddd       mm6,        mm2
            paddd       mm7,        mm4

            ; sumsq -= leaving^2
            pmullw      mm1,        mm1
            movq        mm2,        mm1

            punpcklwd   mm1,        mm0
            psubd       mm6,        mm1

            punpckhwd   mm2,        mm0
            psubd       mm7,        mm2


            ; mm3 = 15 * sumsq (columns 0-1), computed as (x << 4) - x
            movq        mm3,        mm6
            pslld       mm3,        4

            psubd       mm3,        mm6
            movq        mm1,        mm5

            ; 32-bit sum*sum from the low (pmullw) and high (pmulhw) halves
            movq        mm4,        mm5
            pmullw      mm1,        mm1

            pmulhw      mm4,        mm4
            movq        mm2,        mm1

            punpcklwd   mm1,        mm4
            punpckhwd   mm2,        mm4

            ; mm4 = 15 * sumsq (columns 2-3)
            movq        mm4,        mm7
            pslld       mm4,        4

            psubd       mm4,        mm7

            psubd       mm3,        mm1
            psubd       mm4,        mm2

            psubd       mm3,        flimit2
            psubd       mm4,        flimit2

            ; sign bit -> all-ones mask where 15*sumsq - sum^2 - flimit < 0
            psrad       mm3,        31
            psrad       mm4,        31

            packssdw    mm3,        mm4
            packsswb    mm3,        mm0     ; mm3 = per-byte filter mask

            movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row pixels

            movq        mm2,        mm1     ; keep unfiltered copy
            punpcklbw   mm1,        mm0

            paddw       mm1,        mm5     ; pixel + sum
            mov         rcx,        rdx

            and         rcx,        127     ; table index = row & 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp8_rv))]
            movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
            movq        mm4,        [sym(vp8_rv) + rcx*2]
%endif
            paddw       mm1,        mm4     ; + four 16-bit table entries
            psraw       mm1,        4       ; >> 4

            packuswb    mm1,        mm0
            pand        mm1,        mm3     ; filtered where mask set

            pandn       mm3,        mm2     ; original elsewhere
            por         mm1,        mm3

            and         rcx,        15
            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]  ring slot row & 15

            ; nothing to flush until 8 rows have been produced
            cmp         edx,        8
            jl          .skip_assignment

            ; write back the result computed 8 iterations ago
            mov         rcx,        rdx
            sub         rcx,        8
            and         rcx,        15
            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]
            movd        [rsi],      mm1

.skip_assignment:
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row


        ; s += 4 -- done at full pointer width so that a carry out of the
        ; low 32 bits is not dropped on 64-bit builds (rcx is free here and
        ; is reloaded at the top of .loop_col)
        mov         rcx,        arg(0)
        add         rcx,        4
        mov         arg(0),     rcx
        sub         dword arg(3), 4 ; cols -= 4
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 136
    pop         rsp                 ; undo ALIGN_STACK

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2
242
243
;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; Adds noise to a plane in place, one row per outer iteration.
;
; Per row: rand() & 0xff picks a byte offset into the noise buffer, then the
; row is walked 8 bytes at a time. Each group of source bytes is clamped
; with saturating arithmetic (subtract blackclamp, add bothclamp, subtract
; whiteclamp) and the noise bytes are then added with wrapping paddb.
;
; Notes derived from the code below:
;   * Only blackclamp (arg 2) is loaded; whiteclamp and bothclamp are
;     addressed as [blackclamp+16] and [blackclamp+32], so the three vectors
;     must be laid out contiguously in that order.
;   * Only the first 8 bytes of each clamp vector are read (MMX operands).
;   * Both loops test their condition at the bottom, so at least one row and
;     at least 8 bytes per row are processed, and a row is handled in whole
;     8-byte groups.
;   * Start (arg 0) and Height (arg 6) are updated in the argument shadow.
;   * No EMMS is executed here.
extern sym(rand)
global sym(vp8_plane_add_noise_mmx) PRIVATE
sym(vp8_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.next_row:
    call        sym(rand) WRT_PLT
    and         rax, 0xff               ; offset = rand() & 0xff
    mov         rdi, arg(1)             ; noise
    add         rdi, rax                ; rdi = noise + offset

    ; Everything below is (re)loaded after the call because rand() is free
    ; to clobber the caller-saved registers. The clamp vectors are reached
    ; from the blackclamp pointer alone (black/white/both are contiguous).
    mov         rsi, arg(0)             ; current row of the plane
    mov         rdx, arg(2)             ; blackclamp
    movsxd      rcx, dword arg(5)       ; Width
    xor         rax, rax                ; byte position within the row

.next_qword:
    movq        mm1, [rsi+rax]          ; 8 source bytes
    movq        mm2, [rdi+rax]          ; 8 noise bytes

    ; clamp both ends so that adding the noise cannot leave the valid range
    psubusb     mm1, [rdx]              ; blackclamp
    paddusb     mm1, [rdx+32]           ; bothclamp
    psubusb     mm1, [rdx+16]           ; whiteclamp

    paddb       mm1, mm2                ; add the noise
    movq        [rsi+rax], mm1          ; store back in place

    add         rax, 8                  ; advance to the next 8 bytes
    cmp         rax, rcx
    jl          .next_qword

    movsxd      rax, dword arg(7)       ; Pitch
    add         arg(0), rax             ; Start += Pitch
    sub         dword arg(6), 1         ; Height -= 1
    jg          .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
305
306
SECTION_RODATA
align 16
; Read-only word tables. Neither label is referenced by the routines visible
; in this file (NOTE(review): possibly leftovers from removed code - confirm
; before deleting).

; Blur: 48 words laid out as 16 x 16, 8 x 64, 16 x 16, 8 x 0.
Blur:
    times 16 dw 0x0010
    times  8 dw 0x0040
    times 16 dw 0x0010
    times  8 dw 0x0000

; rd: four words of 64 (presumably a rounding term, being half of
; VP8_FILTER_WEIGHT - TODO confirm).
rd:
    times  4 dw 64
317