179f15823c34ae1e423108295e416213200bb280fAndreas Huber;
279f15823c34ae1e423108295e416213200bb280fAndreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
379f15823c34ae1e423108295e416213200bb280fAndreas Huber;
479f15823c34ae1e423108295e416213200bb280fAndreas Huber;  Use of this source code is governed by a BSD-style license
579f15823c34ae1e423108295e416213200bb280fAndreas Huber;  that can be found in the LICENSE file in the root of the source
679f15823c34ae1e423108295e416213200bb280fAndreas Huber;  tree. An additional intellectual property rights grant can be found
779f15823c34ae1e423108295e416213200bb280fAndreas Huber;  in the file PATENTS.  All contributing project authors may
879f15823c34ae1e423108295e416213200bb280fAndreas Huber;  be found in the AUTHORS file in the root of the source tree.
979f15823c34ae1e423108295e416213200bb280fAndreas Huber;
1079f15823c34ae1e423108295e416213200bb280fAndreas Huber
1179f15823c34ae1e423108295e416213200bb280fAndreas Huber%include "vpx_ports/x86_abi_support.asm"
1279f15823c34ae1e423108295e416213200bb280fAndreas Huber
; TABULATE_SSIM - fold one batch of eight pixel pairs into the running sums.
;
; In:       xmm3 = eight source pixels as 16-bit words (s)
;           xmm4 = eight reference pixels as 16-bit words (r)
; Updates:  xmm15 += s      eight 16-bit lanes, saturating   (sum_s)
;           xmm14 += r      eight 16-bit lanes, saturating   (sum_r)
;           xmm13 += s*s    four 32-bit lanes                (sum_sq_s)
;           xmm12 += r*r    four 32-bit lanes                (sum_sq_r)
;           xmm11 += s*r    four 32-bit lanes                (sum_sxr)
; Clobbers: xmm1, xmm2, xmm3
;
; pmaddwd multiplies word pairs and adds adjacent products, leaving four
; independent 32-bit results.  The accumulators are therefore advanced with
; paddd so that each lane stays independent; a 64-bit add would let a carry
; out of an even lane leak into its odd neighbour.  With 8-bit pixel inputs
; and the 32 (16x16) or 8 (8x8) invocations made in this file no lane comes
; near 2^32, so the totals are the same either way.
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3     ; sum_s    += s
        paddusw         xmm14, xmm4     ; sum_r    += r
        movdqa          xmm1, xmm3
        pmaddwd         xmm1, xmm1      ; s*s, adjacent pairs added
        paddd           xmm13, xmm1     ; sum_sq_s += s*s
        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2      ; r*r, adjacent pairs added
        paddd           xmm12, xmm2     ; sum_sq_r += r*r
        pmaddwd         xmm3, xmm4      ; s*r, adjacent pairs added
        paddd           xmm11, xmm3     ; sum_sxr  += s*r
%endmacro
2679f15823c34ae1e423108295e416213200bb280fAndreas Huber
; SUM_ACROSS_Q reg - horizontal add of the four 32-bit lanes of reg.
;
; Requires: xmm0 == 0 (used as the zero source when widening lanes)
; Result:   low qword of reg = d0 + d1 + d2 + d3 as a 64-bit value,
;           high qword of reg = 0
; Clobbers: xmm2
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0      ; xmm2 = (d2, d3) widened to qwords
        punpckldq       %1, xmm0        ; arg  = (d0, d1) widened to qwords
        paddq           %1, xmm2        ; arg  = (d0+d2, d1+d3)
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0      ; xmm2 = (d1+d3, 0)
        punpcklqdq      %1, xmm0        ; arg  = (d0+d2, 0)
        paddq           %1, xmm2        ; arg  = (total, 0)
%endmacro
3879f15823c34ae1e423108295e416213200bb280fAndreas Huber
; SUM_ACROSS_W reg - horizontal add of the eight 16-bit lanes of reg.
;
; The words are first zero-extended to dwords and added pairwise, then the
; four dword partial sums are reduced by SUM_ACROSS_Q.
;
; Requires: xmm0 == 0
; Result:   low qword of reg = sum of the eight words, high qword = 0
; Clobbers: xmm1, xmm2
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0      ; xmm1 = words 4..7 as dwords
        punpcklwd       %1, xmm0        ; arg  = words 0..3 as dwords
        paddd           %1, xmm1        ; four dword partial sums
        SUM_ACROSS_Q    %1
%endmacro
;void vp8_ssim_parms_16x16_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Accumulates the five SSIM sums over a 16x16 block of 8-bit pixels:
;   *sum_s    = sum of s[i]         *sum_r    = sum of r[i]
;   *sum_sq_s = sum of s[i]*s[i]    *sum_sq_r = sum of r[i]*r[i]
;   *sum_sxr  = sum of s[i]*r[i]
;
; In:       s, r   = top-left pixel of the source / reference block
;                    (rows are read with unaligned 16-byte loads)
;           sp, rp = byte distance between consecutive rows of s / r
; Out:      each result is written as a 64-bit value through its pointer;
;           the previous contents are overwritten, not accumulated into
; Clobbers: rax, rcx, rdx, xmm0-xmm6, xmm11-xmm15, flags
;           (rsi and rdi are saved and restored)
;
; The strides are C ints, so only the low 32 bits of their argument slots are
; meaningful; they are sign-extended before being used in 64-bit address
; arithmetic.
;
; NOTE(review): xmm6 and xmm11-xmm15 are written here and never saved.  They
; are callee-saved under the Microsoft x64 convention - confirm whether this
; file is built for Win64 and add a save/restore if it is.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_16x16_sse3)
sym(vp8_ssim_parms_16x16_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    movsxd          rcx,        dword arg(1)    ;sp, sign-extended int
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp, sign-extended int

    pxor            xmm0, xmm0   ;zero, used for unpacking
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 16      ;row counter
.next_row:

    ;grab source and reference pixels
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]

    ;pixels 8..15 of the row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0 ; high_s
    punpckhbw       xmm4, xmm0 ; high_r

    TABULATE_SSIM

    ;pixels 0..7 of the row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz             .next_row

    ;reduce each accumulator to a single value in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov             rdi,arg(4)
    movq            [rdi], xmm15 ;*sum_s
    mov             rdi,arg(5)
    movq            [rdi], xmm14 ;*sum_r
    mov             rdi,arg(6)
    movq            [rdi], xmm13 ;*sum_sq_s
    mov             rdi,arg(7)
    movq            [rdi], xmm12 ;*sum_sq_r
    mov             rdi,arg(8)
    movq            [rdi], xmm11 ;*sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
13479f15823c34ae1e423108295e416213200bb280fAndreas Huber
;void vp8_ssim_parms_8x8_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Accumulates the five SSIM sums over an 8x8 block of 8-bit pixels:
;   *sum_s    = sum of s[i]         *sum_r    = sum of r[i]
;   *sum_sq_s = sum of s[i]*s[i]    *sum_sq_r = sum of r[i]*r[i]
;   *sum_sxr  = sum of s[i]*r[i]
;
; In:       s, r   = top-left pixel of the source / reference block
;                    (rows are read with 8-byte loads)
;           sp, rp = byte distance between consecutive rows of s / r
; Out:      each result is written as a 64-bit value through its pointer;
;           the previous contents are overwritten, not accumulated into
; Clobbers: rax, rcx, rdx, xmm0-xmm6, xmm11-xmm15, flags
;           (rsi and rdi are saved and restored)
;
; The strides are C ints, so only the low 32 bits of their argument slots are
; meaningful; they are sign-extended before being used in 64-bit address
; arithmetic.
;
; NOTE(review): xmm6 and xmm11-xmm15 are written here and never saved.  They
; are callee-saved under the Microsoft x64 convention - confirm whether this
; file is built for Win64 and add a save/restore if it is.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_8x8_sse3)
sym(vp8_ssim_parms_8x8_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0)          ;s
    movsxd          rcx,        dword arg(1)    ;sp, sign-extended int
    mov             rdi,        arg(2)          ;r
    movsxd          rax,        dword arg(3)    ;rp, sign-extended int

    pxor            xmm0, xmm0   ;zero, used for unpacking
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8       ;row counter
.next_row:

    ;grab source and reference pixels
    movq            xmm5, [rsi]
    movq            xmm6, [rdi]

    ;the eight pixels of the row, widened to words
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz             .next_row

    ;reduce each accumulator to a single value in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov             rdi,arg(4)
    movq            [rdi], xmm15 ;*sum_s
    mov             rdi,arg(5)
    movq            [rdi], xmm14 ;*sum_r
    mov             rdi,arg(6)
    movq            [rdi], xmm13 ;*sum_sq_s
    mov             rdi,arg(7)
    movq            [rdi], xmm12 ;*sum_sq_r
    mov             rdi,arg(8)
    movq            [rdi], xmm11 ;*sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
216