;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; TABULATE_SSIM - fold one group of eight pixel pairs into the running sums
; sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr.
;
;   In:       xmm3 = eight source pixels as words (the callers in this file
;                    unpack bytes against a zeroed xmm0 before invoking)
;             xmm4 = eight reference pixels as words (same unpacking)
;   Updates:  xmm15 += s      sum_s     (eight word lanes, saturating add)
;             xmm14 += r      sum_r     (eight word lanes, saturating add)
;             xmm13 += s*s    sum_sq_s  (four dword lanes)
;             xmm12 += r*r    sum_sq_r  (four dword lanes)
;             xmm11 += s*r    sum_sxr   (four dword lanes)
;   Clobbers: xmm1, xmm2, xmm3 (xmm4 is left untouched)
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3  ; sum_s
        paddusw         xmm14, xmm4  ; sum_r
        movdqa          xmm1, xmm3
        pmaddwd         xmm1, xmm1   ; adjacent s*s products paired into dwords
        paddd           xmm13, xmm1  ; sum_sq_s
        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2   ; adjacent r*r products paired into dwords
        paddd           xmm12, xmm2  ; sum_sq_r
        pmaddwd         xmm3, xmm4   ; adjacent s*r products; overwrites xmm3
        paddd           xmm11, xmm3  ; sum_sxr
%endmacro

; SUM_ACROSS_Q - horizontally add the four dword lanes of register %1.
;
; The dwords are first zero-extended to qwords (hence the name) so that the
; additions are carried out with 64-bit lanes, then the two qword halves are
; folded together.
;
;   In:       %1   = four dword partial sums
;             xmm0 = all zeros (used as the zero-extension source)
;   Out:      %1   = total in the low qword, zero in the high qword
;   Clobbers: xmm2
%macro SUM_ACROSS_Q 1
        movdqa          xmm2,%1
        punpckldq       %1,xmm0      ; dwords 0,1 -> two qwords
        punpckhdq       xmm2,xmm0    ; dwords 2,3 -> two qwords
        paddq           %1,xmm2      ; two qword partial sums
        movdqa          xmm2,%1
        punpcklqdq      %1,xmm0      ; keep low qword, zero the high one
        punpckhqdq      xmm2,xmm0    ; move high qword down, zero the high one
        paddq           %1,xmm2      ; low qword = grand total
%endmacro

; SUM_ACROSS_W - horizontally add the eight word lanes of register %1.
;
; The words are zero-extended to dwords and added pairwise, after which
; SUM_ACROSS_Q finishes the reduction.
;
;   In:       %1   = eight word partial sums
;             xmm0 = all zeros (used as the zero-extension source)
;   Out:      %1   = total in the low qword, zero in the high qword
;   Clobbers: xmm1, xmm2 (xmm2 through SUM_ACROSS_Q)
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpcklwd       %1,xmm0      ; words 0..3 -> dwords
        punpckhwd       xmm1,xmm0    ; words 4..7 -> dwords
        paddd           %1, xmm1     ; four dword partial sums
        SUM_ACROSS_Q    %1
%endmacro
;void vp8_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 16 rows of 16 pixels from the source block s (row stride sp) and the
; reference block r (row stride rp) and stores five totals:
;   *sum_s    = sum of s[i]          *sum_r    = sum of r[i]
;   *sum_sq_s = sum of s[i]*s[i]     *sum_sq_r = sum of r[i]*r[i]
;   *sum_sxr  = sum of s[i]*r[i]
;
; Register use inside the loop:
;   rsi = current s row, rdi = current r row, rcx = sp, rax = rp,
;   rdx = rows remaining, xmm0 = zero, xmm11..xmm15 = accumulators
;   (see TABULATE_SSIM), xmm1..xmm6 = scratch.
;
; Only the low 32 bits of each total are written (movd). The largest possible
; total for 256 pixels is 256*255*255 = 16646400, so no value is truncated.
; NOTE(review): if unsigned long is 64 bits wide for the caller, the upper
; half of each output is left as the caller set it - confirm callers zero the
; outputs before the call.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
sym(vp8_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so sign-extend them instead of loading all 64 bits. This also
    ; keeps negative strides correct in the pointer arithmetic below.
    movsxd          rcx,        dword arg(1) ;sp
    mov             rdi,        arg(2) ;r
    movsxd          rax,        dword arg(3) ;rp

    pxor            xmm0, xmm0   ;zero, used for every byte->word unpack
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 16      ;row counter
.NextRow:

    ;grab source and reference pixels (16 bytes each, no alignment assumed)
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0 ; high_s
    punpckhbw       xmm4, xmm0 ; high_r

    TABULATE_SSIM

    ;TABULATE_SSIM clobbered xmm3, so rebuild both operands from the copies
    ;kept in xmm5/xmm6
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ;reduce each accumulator to a single total in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;rdi is free again: reuse it for each output pointer in turn
    mov             rdi,arg(4)
    movd            [rdi], xmm15 ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14 ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13 ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12 ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11 ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 8 rows of 8 pixels from the source block s (row stride sp) and the
; reference block r (row stride rp) and stores five totals:
;   *sum_s    = sum of s[i]          *sum_r    = sum of r[i]
;   *sum_sq_s = sum of s[i]*s[i]     *sum_sq_r = sum of r[i]*r[i]
;   *sum_sxr  = sum of s[i]*r[i]
;
; Register use inside the loop:
;   rsi = current s row, rdi = current r row, rcx = sp, rax = rp,
;   rdx = rows remaining, xmm0 = zero, xmm11..xmm15 = accumulators
;   (see TABULATE_SSIM), xmm1..xmm4 = scratch.
;
; Only the low 32 bits of each total are written (movd). The largest possible
; total for 64 pixels is 64*255*255 = 4161600, so no value is truncated.
; NOTE(review): if unsigned long is 64 bits wide for the caller, the upper
; half of each output is left as the caller set it - confirm callers zero the
; outputs before the call.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
sym(vp8_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so sign-extend them instead of loading all 64 bits. This also
    ; keeps negative strides correct in the pointer arithmetic below.
    movsxd          rcx,        dword arg(1) ;sp
    mov             rdi,        arg(2) ;r
    movsxd          rax,        dword arg(3) ;rp

    pxor            xmm0, xmm0   ;zero, used for every byte->word unpack
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8      ;row counter
.NextRow:

    ;grab source and reference pixels (8 bytes each; movq zeroes the upper
    ;half of the register)
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0 ; low_s
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ;reduce each accumulator to a single total in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ;rdi is free again: reuse it for each output pointer in turn
    mov             rdi,arg(4)
    movd            [rdi], xmm15 ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14 ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13 ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12 ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11 ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret