;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; TABULATE_SSIM - fold one batch of eight pixel pairs into the running sums
; sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr.
;   In:       xmm3 = eight source values, xmm4 = eight reference values,
;             one value per 16-bit word lane
;   Updates:  xmm15 (sum_s), xmm14 (sum_r)      - word lanes, saturating add
;             xmm13 (sum_sq_s), xmm12 (sum_sq_r),
;             xmm11 (sum_sxr)                   - dword lanes
;   Clobbers: xmm1, xmm2, xmm3
%macro TABULATE_SSIM 0
    paddusw     xmm15, xmm3             ; sum_s
    paddusw     xmm14, xmm4             ; sum_r
    movdqa      xmm1, xmm3
    pmaddwd     xmm1, xmm1              ; s*s, adjacent word products pre-added
    paddd       xmm13, xmm1             ; sum_sq_s
    movdqa      xmm2, xmm4
    pmaddwd     xmm2, xmm2              ; r*r
    paddd       xmm12, xmm2             ; sum_sq_r
    pmaddwd     xmm3, xmm4              ; s*r (overwrites the source words)
    paddd       xmm11, xmm3             ; sum_sxr
%endmacro

; SUM_ACROSS_Q - horizontally add the four dword lanes of register %1.
; The dwords are zero-extended to qwords and added, leaving the total in
; the low qword of %1.
;   Requires: xmm0 == 0 (used as the zero source for the unpacks)
;   Clobbers: xmm2
%macro SUM_ACROSS_Q 1
    movdqa      xmm2, %1
    punpckldq   %1, xmm0                ; dwords 0,1 -> two qwords
    punpckhdq   xmm2, xmm0              ; dwords 2,3 -> two qwords
    paddq       %1, xmm2
    movdqa      xmm2, %1
    punpcklqdq  %1, xmm0                ; keep the low qword
    punpckhqdq  xmm2, xmm0              ; bring the high qword down
    paddq       %1, xmm2                ; low qword of %1 = total
%endmacro

; SUM_ACROSS_W - horizontally add the eight word lanes of register %1.
; The words are zero-extended to dwords and added pairwise, then
; SUM_ACROSS_Q finishes the reduction into the low qword of %1.
;   Requires: xmm0 == 0
;   Clobbers: xmm1, xmm2
%macro SUM_ACROSS_W 1
    movdqa      xmm1, %1
    punpcklwd   %1, xmm0                ; words 0-3 -> four dwords
    punpckhwd   xmm1, xmm0              ; words 4-7 -> four dwords
    paddd       %1, xmm1
    SUM_ACROSS_Q %1
%endmacro

;void vp9_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;
; Walks 16 rows of 16 bytes from s and r (unaligned loads), widening each
; byte to a word and feeding both 8-pixel halves of a row to TABULATE_SSIM.
;
; Register roles inside the function:
;   rsi = current s row      rcx = sp (s stride, sign-extended)
;   rdi = current r row      rax = rp (r stride, sign-extended)
;   rdx = rows remaining     xmm0 = zero (required by the unpacks/macros)
;   xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;   xmm1..xmm6   = scratch
;
; NOTE(review): each result is written with movd, i.e. only 32 bits are
; stored through every output pointer - confirm callers zero-initialise
; or use 32-bit storage.
global sym(vp9_ssim_parms_16x16_sse2) PRIVATE
sym(vp9_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9              ; from x86_abi_support.asm; makes arg(n) usable
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so sign-extend them before using them in pointer arithmetic.
    movsxd      rcx, dword arg(1)       ; sp
    mov         rdi, arg(2)             ; r
    movsxd      rax, dword arg(3)       ; rp

    pxor        xmm0, xmm0
    pxor        xmm15, xmm15            ; sum_s
    pxor        xmm14, xmm14            ; sum_r
    pxor        xmm13, xmm13            ; sum_sq_s
    pxor        xmm12, xmm12            ; sum_sq_r
    pxor        xmm11, xmm11            ; sum_sxr

    mov         rdx, 16                 ; row counter
.NextRow:

    ; grab source and reference pixels
    movdqu      xmm5, [rsi]
    movdqu      xmm6, [rdi]
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpckhbw   xmm3, xmm0              ; high_s
    punpckhbw   xmm4, xmm0              ; high_r

    TABULATE_SSIM

    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpcklbw   xmm3, xmm0              ; low_s
    punpcklbw   xmm4, xmm0              ; low_r

    TABULATE_SSIM

    add         rsi, rcx                ; next s row
    add         rdi, rax                ; next r row

    dec         rdx                     ; counter
    jnz         .NextRow

    ; reduce each accumulator to a single value in its low lane
    SUM_ACROSS_W xmm15
    SUM_ACROSS_W xmm14
    SUM_ACROSS_Q xmm13
    SUM_ACROSS_Q xmm12
    SUM_ACROSS_Q xmm11

    mov         rdi, arg(4)
    movd        [rdi], xmm15            ; *sum_s    (low 32 bits)
    mov         rdi, arg(5)
    movd        [rdi], xmm14            ; *sum_r
    mov         rdi, arg(6)
    movd        [rdi], xmm13            ; *sum_sq_s
    mov         rdi, arg(7)
    movd        [rdi], xmm12            ; *sum_sq_r
    mov         rdi, arg(8)
    movd        [rdi], xmm11            ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb. At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
;
; 8x8 variant: walks 8 rows, loading 8 bytes per row from s and r, widening
; them to words and feeding them to TABULATE_SSIM once per row.
;
; Register roles inside the function:
;   rsi = current s row      rcx = sp (s stride, sign-extended)
;   rdi = current r row      rax = rp (r stride, sign-extended)
;   rdx = rows remaining     xmm0 = zero (required by the unpacks/macros)
;   xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;   xmm1..xmm4   = scratch
;
; NOTE(review): each result is written with movd, i.e. only 32 bits are
; stored through every output pointer - confirm callers zero-initialise
; or use 32-bit storage.
global sym(vp9_ssim_parms_8x8_sse2) PRIVATE
sym(vp9_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9              ; from x86_abi_support.asm; makes arg(n) usable
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; s
    ; sp and rp are C ints: only the low 32 bits of their argument slots are
    ; defined, so sign-extend them before using them in pointer arithmetic.
    movsxd      rcx, dword arg(1)       ; sp
    mov         rdi, arg(2)             ; r
    movsxd      rax, dword arg(3)       ; rp

    pxor        xmm0, xmm0
    pxor        xmm15, xmm15            ; sum_s
    pxor        xmm14, xmm14            ; sum_r
    pxor        xmm13, xmm13            ; sum_sq_s
    pxor        xmm12, xmm12            ; sum_sq_r
    pxor        xmm11, xmm11            ; sum_sxr

    mov         rdx, 8                  ; row counter
.NextRow:

    ; grab source and reference pixels
    movq        xmm3, [rsi]
    movq        xmm4, [rdi]
    punpcklbw   xmm3, xmm0              ; low_s
    punpcklbw   xmm4, xmm0              ; low_r

    TABULATE_SSIM

    add         rsi, rcx                ; next s row
    add         rdi, rax                ; next r row

    dec         rdx                     ; counter
    jnz         .NextRow

    ; reduce each accumulator to a single value in its low lane
    SUM_ACROSS_W xmm15
    SUM_ACROSS_W xmm14
    SUM_ACROSS_Q xmm13
    SUM_ACROSS_Q xmm12
    SUM_ACROSS_Q xmm11

    mov         rdi, arg(4)
    movd        [rdi], xmm15            ; *sum_s    (low 32 bits)
    mov         rdi, arg(5)
    movd        [rdi], xmm14            ; *sum_r
    mov         rdi, arg(6)
    movd        [rdi], xmm13            ; *sum_sq_s
    mov         rdi, arg(7)
    movd        [rdi], xmm12            ; *sum_sq_r
    mov         rdi, arg(8)
    movd        [rdi], xmm11            ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret