1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; FIRST_2_ROWS - first half of the deblocking kernel (shared by the down
; and across passes).
;   In:    xmm0   = centre pixels (16 bytes)
;          xmm1   = first neighbour vector
;          xmm3   = second neighbour vector
;          flimit = 16-byte memory operand with the per-byte limits
;   Out:   xmm5   = pavgb(xmm1, xmm3)
;          xmm7   = 0xFF in each byte where |xmm0-xmm1| >= flimit or
;                   |xmm0-xmm3| >= flimit (i.e. "keep original pixel")
;   Clobb: xmm1 (left all-zero), xmm2, xmm3, xmm4, xmm6.  xmm0 is preserved.
%macro FIRST_2_ROWS 0
        ;rounded average of the two neighbours (must read xmm1/xmm3
        ;before they are destroyed below)
        movdqa      xmm5,       xmm1
        movdqa      xmm4,       xmm0
        pavgb       xmm5,       xmm3
        movdqa      xmm6,       xmm0

        ;|xmm0 - xmm1| = sat(xmm0 - xmm1) + sat(xmm1 - xmm0)
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        paddusb     xmm4,       xmm1

        ;|xmm0 - xmm3| the same way
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm3

        ;two copies of the threshold plus a zero vector to compare with
        movdqa      xmm2,       flimit
        movdqa      xmm7,       xmm2
        pxor        xmm1,       xmm1

        ;sat(flimit - |diff|) == 0  <=>  |diff| >= flimit
        psubusb     xmm2,       xmm4
        pcmpeqb     xmm2,       xmm1
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm7,       xmm1

        ;combine both neighbour masks
        por         xmm7,       xmm2
%endmacro
41
; SECOND_2_ROWS - second half of the deblocking kernel; must follow
; FIRST_2_ROWS after xmm1/xmm3 have been reloaded with the other two
; neighbour vectors.
;   In:    xmm0   = centre pixels
;          xmm1   = third neighbour vector, xmm3 = fourth neighbour vector
;          xmm5   = neighbour average left by FIRST_2_ROWS
;          xmm7   = "keep original" mask left by FIRST_2_ROWS
;          flimit = 16-byte memory operand with the per-byte limits
;   Out:   xmm0   = original byte where any of the four neighbour
;                   differences is >= flimit, otherwise
;                   pavgb(pavgb(xmm5_in, pavgb(xmm1, xmm3)), xmm0)
;   Clobb: xmm1 (left all-zero), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
%macro SECOND_2_ROWS 0
        ;finish the filtered value first: average of this neighbour pair,
        ;folded into the previous pair's average, then into the centre
        movdqa      xmm2,       xmm1
        pavgb       xmm1,       xmm3
        pavgb       xmm5,       xmm1
        pavgb       xmm5,       xmm0

        ;|xmm0 - third neighbour| (xmm2 holds the saved copy)
        movdqa      xmm6,       xmm0
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        paddusb     xmm6,       xmm2

        ;|xmm0 - fourth neighbour|
        movdqa      xmm4,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm3

        ;two copies of the threshold plus a zero vector to compare with
        movdqa      xmm2,       flimit
        movdqa      xmm3,       xmm2
        pxor        xmm1,       xmm1

        ;sat(flimit - |diff|) == 0  <=>  |diff| >= flimit
        psubusb     xmm2,       xmm6
        pcmpeqb     xmm2,       xmm1
        psubusb     xmm3,       xmm4
        pcmpeqb     xmm3,       xmm1

        ;accumulate into the mask started by FIRST_2_ROWS
        por         xmm7,       xmm2
        por         xmm7,       xmm3

        ;select: (orig & mask) + (filtered & ~mask); the two halves never
        ;overlap within a byte, so the saturating add acts as a merge
        pand        xmm0,       xmm7
        pandn       xmm7,       xmm5
        paddusb     xmm0,       xmm7
%endmacro
79
; UPDATE_FLIMIT - copy the 16 limit bytes at the cursor rbx into the
; stack slot at [rsp] and advance the cursor by 16.
;   In:    rbx = pointer to the next 16 limit bytes (any alignment)
;   Out:   [rsp] = those 16 bytes, rbx += 16
;   Clobb: xmm2, flags
%macro UPDATE_FLIMIT 0
        add         rbx,        16
        movdqu      xmm2,       XMMWORD PTR [rbx - 16]  ;bytes at the old cursor
        movdqu      [rsp],      xmm2
%endmacro
85
86SECTION .text
87
;void vpx_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows:
;   1. "down" pass: filters the row vertically, reading src rows -2..+2
;      and writing the result to dst, 16 columns at a time;
;   2. "across" pass: filters the freshly written dst row horizontally in
;      place, using the pixels at -2..+2.
;
; Notes derived from the code below:
;   * flimits is consumed as one byte per pixel column (16 bytes are
;     loaded for every 16 columns), and the cursor restarts at arg(5)
;     for every pass.
;   * The across pass overwrites the 8 bytes left of each dst row and the
;     8 bytes starting at dst + cols with replicated edge pixels, and it
;     reads/rewrites [dst - 16, dst).
;   * Columns are processed in steps of 16; when the last step overshoots
;     cols, only its first 8 output bytes are stored by the across pass.
;
; Register roles inside the loops:
;   rsi = src row          rdi = dst row
;   rax = src stride       rcx = rows left
;   rdx = column offset    rbx = cursor into flimits
;   [rsp] (`flimit`) = limit bytes for the current 16 columns
;   mm0 / mm1 = filtered output of the previous 16 columns (across pass)
global sym(vpx_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vpx_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16

        ; put flimit on stack
        mov         rbx,        arg(5)           ;flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)           ;src_ptr
        mov         rdi,        arg(1)           ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
        xor         rdx,        rdx              ;col
.nextcol:
        ;load current and next 2 rows
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows
        neg         rax                          ; negative stride
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                          ; positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .downdone
        UPDATE_FLIMIT
        jmp         .nextcol

.downdone:
        ; done with the all cols, start the across filtering in place
        ; (rdx = number of bytes the column loop advanced; rewind both rows)
        sub         rsi,        rdx
        sub         rdi,        rdx

        mov         rbx,        arg(5) ; flimits
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,   [rdi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        mov         rdx,    -8
        movq        [rdi+rdx], mm1

        ; dup the last byte into the right border
        movsxd      rdx,    dword arg(4)
        movq        mm1,   [rdi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rdi+rdx], mm1

        ; prime the deferred-store registers with what is already in
        ; memory so the first "store previous" below rewrites the same bytes
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16];
        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        ; The previous 16 results are written only now, after this step's
        ; -2/-1 loads, so those loads still saw the unfiltered pixels.
        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
        movdq2q     mm0,        xmm0             ; low 8 results
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0             ; high 8 results

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .acrossdone
        UPDATE_FLIMIT
        jmp         .acrossnextcol

.acrossdone:
        ; last 16 pixels
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the high 8 results are only valid when the last step ended
        ; exactly on cols
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
        ; done with this row
        add         rsi,rax                      ;next src line
        ; Sign-extend both strides exactly as the initial load above does,
        ; so a negative stride produces the same addressing on every row.
        movsxd      rax, dword arg(3)            ;dst_pixels_per_line
        add         rdi,rax                      ;next destination
        movsxd      rax, dword arg(2)            ;src_pixels_per_line

        mov         rbx,        arg(5)           ;flimits
        UPDATE_FLIMIT

        dec         rcx                          ;decrement count
        jnz         .nextrow                     ;next row

    ; mm0/mm1 were used above: leave the MMX state before returning so the
    ; caller can safely execute x87 code.
    emms

    add rsp, 16
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    pop rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit
234
;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
;                               int pitch, int rows, int cols,int flimit)
;
; Vertical post-processing filter, applied in place to `dst`, one strip of
; 8 columns at a time.  For every pixel a 15-row window (rows r-7 .. r+7)
; supplies a running sum and sum of squares; when
;       15 * sumsq - sum * sum < flimit
; the pixel is replaced by (pixel + sum + vpx_rv[...]) >> 4, otherwise it
; is left untouched.
;
; Notes derived from the code below:
;   * The 8 rows above and the 8 rows below the image are overwritten with
;     copies of the first / last row of each strip.
;   * Results are parked in the 16-entry ring d[16][8] on the stack and
;     written back 8 rows late, so the window always reads unfiltered data.
;   * arg(0), arg(2) and arg(3) are modified in their shadow slots.
;
; Register roles inside .loop_row:
;   rsi = s + (r - 8) * pitch    rdi = s + (r + 7) * pitch
;   rax = pitch                  rdx = r (row counter, 0 .. rows + 7)
;   xmm5 = sum (8 words)         xmm6 / xmm7 = sumsq (low / high 4 dwords)
;   xmm0 = zero                  r8 = &vpx_rv (64-bit builds)
extern sym(vpx_rv)
global sym(vpx_mbpost_proc_down_sse2) PRIVATE
sym(vpx_mbpost_proc_down_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vpx_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
            mov         rsi,        arg(0) ; s
            pxor        xmm0,       xmm0        ;

            movsxd      rax,        dword ptr arg(1) ;pitch       ;

            ; this copies the last row down into the border 8 rows
            ; (arg(2) already holds rows + 8, so rdx = rows - 1).  rows is
            ; a 32-bit int: load it sign-extended instead of reading the
            ; whole pointer-sized slot.
            mov         rdi,        rsi
            movsxd      rdx,        dword ptr arg(2)
            sub         rdx,        9
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        xmm1,       QWORD ptr[rdi]              ; last row
            mov         rcx,        8
.init_borderd:                                                  ; initialize borders
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_borderd

            neg         rax                                     ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        xmm1,       QWORD ptr[rdi]              ; first row
            mov         rcx,        8
.init_border:                                                   ; initialize borders
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_border



            lea         rsi,        [rsi + rax*8];              ; rsi = s - 8*pitch
            neg         rax                                     ; rax = +pitch

            pxor        xmm5,       xmm5
            pxor        xmm6,       xmm6        ;

            pxor        xmm7,       xmm7        ;
            mov         rdi,        rsi

            mov         rcx,        15          ;

            ; sum / sumsq over the 15 rows s[-8*pitch] .. s[6*pitch];
            ; rdi ends up at s + 7*pitch
.loop_initvar:
            movq        xmm1,       QWORD PTR [rdi];
            punpcklbw   xmm1,       xmm0        ;

            paddw       xmm5,       xmm1        ;
            pmullw      xmm1,       xmm1        ;

            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ;

            punpckhwd   xmm2,       xmm0        ;
            paddd       xmm6,       xmm1        ;

            paddd       xmm7,       xmm2        ;
            lea         rdi,        [rdi+rax]   ;

            dec         rcx
            jne         .loop_initvar
            ;save the var and sum
            xor         rdx,        rdx
.loop_row:
            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            ; slide the window: add the entering row, drop the leaving row
            paddw       xmm5,       xmm2
            psubw       xmm5,       xmm1

            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            ; xmm3 / xmm4 = 15 * sumsq (as 16*x - x)
            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6
            movdqa      xmm1,       xmm5

            ; xmm1 / xmm2 = sum * sum as dwords (low/high product halves
            ; interleaved)
            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1

            pmulhw      xmm4,       xmm4
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4
            punpckhwd   xmm2,       xmm4

            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            ; sign bit -> all-ones where 15*sumsq - sum*sum - flimit < 0
            psrad       xmm3,       31
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0

            movq        xmm1,       QWORD PTR [rsi+rax*8]   ; current row r

            movq        xmm2,       xmm1
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5
            mov         rcx,        rdx

            and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vpx_rv))]
            movdqu      xmm4,       [rax + rcx*2] ;vpx_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ;vpx_rv[rcx*2]
%else
            movdqu      xmm4,       [sym(vpx_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            ; merge: filtered where the mask is set, original elsewhere
            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3

            pandn       xmm3,       xmm2
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]

            cmp         edx,        8
            jl          .skip_assignment

            ; write back the row computed 8 iterations ago.  xmm4 is dead
            ; here (it is rewritten before its next read), so it serves as
            ; the copy register and no MMX state is touched.
            mov         rcx,        rdx
            sub         rcx,        8
            and         rcx,        15
            movq        xmm4,       QWORD PTR [rsp + rcx*8] ;d[rcx*8]
            movq        QWORD PTR   [rsi], xmm4

.skip_assignment:
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row

        ; s += 8, done at full pointer width so that a carry out of the
        ; low 32 bits is not lost on 64-bit builds (rax is reloaded at the
        ; top of .loop_col)
        mov         rax,        arg(0)
        add         rax,        8
        mov         arg(0),     rax
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
465
466
;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
;                                    int pitch, int rows, int cols,int flimit)
;
; Horizontal post-processing filter, applied in place to `src`, one row at
; a time and 4 pixels per inner iteration.  For every pixel a 15-pixel
; window (c-7 .. c+7) supplies a running sum and sum of squares; when
;       15 * sumsq - sum * sum < flimit
; the pixel is replaced by (pixel + sum + 8) >> 4, otherwise it is kept.
;
; Notes derived from the code below:
;   * The 8 bytes left of each row and the 8 bytes starting at s + cols
;     are overwritten with replicated edge pixels; the inner loop also
;     stores zeros to s[-8 .. -1] while its output pipeline fills.
;   * Results travel through mm0 / mm1 and are stored 8 pixels late, so
;     the window always reads unfiltered data.
;   * arg(0) and arg(2) are modified in their shadow slots.
;
; Register roles inside .nextcol4:
;   rsi = s (row start)     rcx = c (column, step 4)    rdx = cols + 8
;   xmm6 = sum, xmm7 = sumsq (lane 0 carries the value between iterations)
;   xmm0 = zero
global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE
sym(vpx_mbpost_proc_across_ip_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s


        ; dup the first byte into the left border 8 times
        movq        mm1,   [rsi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1

        mov         rdi,    -8
        movq        [rsi+rdi], mm1

        ; dup the last byte into the right border.
        ; rax (not rdx) carries cols here: rdx is the sumsq accumulator
        ; of .ip_var_loop and must still be zero when that loop starts.
        movsxd      rax,    dword arg(3)
        movq        mm1,   [rsi + rax + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rsi+rax], mm1

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                  ; ax = s[i]*s[i]; eax upper half is 0
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx

            ; run 8 columns past the end so the delayed stores drain
            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8
            pxor        mm0,    mm0
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0
.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2

            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
            pmaddwd     xmm1,   xmm2                    ; (new+old)*(new-old) = new^2 - old^2

            ; lane 0 of xmm6 / xmm7 becomes sum / sumsq for pixel c
            paddd       xmm6,   xmm2
            paddd       xmm7,   xmm1

            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones

            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000 squared
            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000

            ; prefix-sum the remaining three deltas into lanes 1..3
            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   0000  10--5 squared
            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   0000  10--5

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; sum * sum

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7                    ; 15 * sumsq
            psubd       xmm5,   xmm3

            ; sign bit -> all-ones where 15*sumsq - sum*sum - flimit < 0
            psubd       xmm5,   flimit4
            psrad       xmm5,   31

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0

            movd        xmm1,   DWORD PTR [rsi+rcx]
            movq        xmm2,   xmm1

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6
            paddd       xmm1,   [GLOBAL(four8s)]

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            ; merge: filtered where the mask is set, original elsewhere
            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5

            pandn       xmm5,   xmm2
            por         xmm5,   xmm1

            ; two-stage delay line: store what was computed two
            ; iterations (8 pixels) ago, then shift the new result in
            movd        [rsi+rcx-8],  mm0
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12                      ; keep lane 3 as next lane 0

            psrldq      xmm6,   12                      ; keep lane 3 as next lane 0
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg .ip_row_loop

    ; mm0/mm1 were used above: leave the MMX state before returning so the
    ; caller can safely execute x87 code.
    emms

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4
658
659
SECTION_RODATA
; four8s: four dwords of 8, 16-byte aligned so it can be used directly as
; a 128-bit memory operand (the +8 term added before the >>4 in
; vpx_mbpost_proc_across_ip_sse2).
align 16
four8s:
    dd 8, 8, 8, 8
664