;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; TABULATE_SSIM - fold one batch of eight 16-bit samples into the five
; running SSIM accumulators.
;   In:    xmm3 = eight source words (s), xmm4 = eight reference words (r)
;   Out:   xmm15 += s       (word lanes, unsigned saturating)   sum_s
;          xmm14 += r       (word lanes, unsigned saturating)   sum_r
;          xmm13 += s*s     (dword lanes, adjacent pairs added) sum_sq_s
;          xmm12 += r*r     (dword lanes, adjacent pairs added) sum_sq_r
;          xmm11 += s*r     (dword lanes, adjacent pairs added) sum_sxr
;   Clobb: xmm1, xmm2, xmm3
%macro TABULATE_SSIM 0
        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r
        movdqa          xmm1, xmm3      ; keep xmm3 intact for the s*r product
        pmaddwd         xmm1, xmm1
        paddd           xmm13, xmm1     ; sum_sq_s
        movdqa          xmm2, xmm4
        pmaddwd         xmm2, xmm2
        paddd           xmm12, xmm2     ; sum_sq_r
        pmaddwd         xmm3, xmm4      ; xmm3 is consumed here
        paddd           xmm11, xmm3     ; sum_sxr
%endmacro

; SUM_ACROSS_Q - horizontally add the four dword lanes of %1.
;   In:    %1 = four unsigned dwords, xmm0 = all zero
;   Out:   low qword of %1 = sum of the four lanes, high qword of %1 = 0
;   Clobb: xmm2 (so %1 must not be xmm2 or xmm0)
%macro SUM_ACROSS_Q 1
        movdqa          xmm2, %1
        punpckldq       %1, xmm0        ; zero-extend lanes 0,1 to qwords
        punpckhdq       xmm2, xmm0      ; zero-extend lanes 2,3 to qwords
        paddq           %1, xmm2        ; two partial sums
        movdqa          xmm2, %1
        punpcklqdq      %1, xmm0        ; low partial sum, high qword cleared
        punpckhqdq      xmm2, xmm0      ; high partial sum moved to low qword
        paddq           %1, xmm2        ; total in low qword
%endmacro

; SUM_ACROSS_W - horizontally add the eight word lanes of %1.
; The words are first widened to dwords, then reduced with SUM_ACROSS_Q.
;   In:    %1 = eight unsigned words, xmm0 = all zero
;   Out:   low qword of %1 = sum of the eight lanes, high qword of %1 = 0
;   Clobb: xmm1, xmm2 (so %1 must not be xmm0, xmm1 or xmm2)
%macro SUM_ACROSS_W 1
        movdqa          xmm1, %1
        punpcklwd       %1, xmm0        ; zero-extend words 0..3 to dwords
        punpckhwd       xmm1, xmm0      ; zero-extend words 4..7 to dwords
        paddd           %1, xmm1
        SUM_ACROSS_Q    %1
%endmacro

;void vp8_ssim_parms_16x16_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parm passing through a structure. We probably don't need the
; pxors (the calling app will initialize to 0). Everything could easily fit
; in sse2 without too much hassle, and we can probably do better estimates
; with psadbw or pavgb. At this point this is just meant to be a first pass
; for calculating all the parms needed for 16x16 ssim so we can play with
; dssim as distortion in mode selection code.
; Register roles inside vp8_ssim_parms_16x16_sse2:
;   rsi = s (current source row)       rcx = sp (source stride)
;   rdi = r (current reference row)    rax = rp (reference stride)
;   rdx = rows remaining               xmm0 = zero (used for unpacking)
;   xmm5/xmm6 = raw 16-byte rows       xmm3/xmm4 = rows widened to words
;   xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
; The prolog/epilog macros (SHADOW_ARGS_TO_STACK, SAVE_XMM, arg, ...) are
; not defined in this file; presumably they come from x86_abi_support.asm.
global sym(vp8_ssim_parms_16x16_sse2) PRIVATE
sym(vp8_ssim_parms_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;s
    ; sp and rp are C ints: sign-extend the low 32 bits instead of trusting
    ; the upper half of the 64-bit argument slot, which the ABI leaves
    ; undefined for 32-bit arguments.
    movsxd          rcx,        dword arg(1) ;sp
    mov             rdi,        arg(2) ;r
    movsxd          rax,        dword arg(3) ;rp

    pxor            xmm0, xmm0
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 16      ;row counter
.NextRow:

    ;grab source and reference pixels (16 bytes each, no alignment assumed)
    movdqu          xmm5, [rsi]
    movdqu          xmm6, [rdi]
    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpckhbw       xmm3, xmm0 ; high_s (bytes 8..15 widened to words)
    punpckhbw       xmm4, xmm0 ; high_r

    TABULATE_SSIM

    movdqa          xmm3, xmm5
    movdqa          xmm4, xmm6
    punpcklbw       xmm3, xmm0 ; low_s (bytes 0..7 widened to words)
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; reduce each accumulator to a single value in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; NOTE(review): movd stores only the low 32 bits of each total, while the
    ; prototype comment declares the outputs as unsigned long *. Where
    ; unsigned long is 64 bits the upper half of each output is left
    ; untouched - confirm callers zero-initialize or use a 32-bit type.
    mov             rdi,arg(4)
    movd            [rdi], xmm15    ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14    ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13    ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12    ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_ssim_parms_8x8_sse2(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; TODO: Use parm passing through a structure. We probably don't need the
; pxors (the calling app will initialize to 0). Everything could easily fit
; in sse2 without too much hassle, and we can probably do better estimates
; with psadbw or pavgb. At this point this is just meant to be a first pass
; for calculating all the parms needed for 8x8 ssim so we can play with
; dssim as distortion in mode selection code.
;
; Register roles inside vp8_ssim_parms_8x8_sse2:
;   rsi = s (current source row)       rcx = sp (source stride)
;   rdi = r (current reference row)    rax = rp (reference stride)
;   rdx = rows remaining               xmm0 = zero (used for unpacking)
;   xmm3/xmm4 = 8-byte rows widened to words
;   xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
; The prolog/epilog macros (SHADOW_ARGS_TO_STACK, SAVE_XMM, arg, ...) are
; not defined in this file; presumably they come from x86_abi_support.asm.
global sym(vp8_ssim_parms_8x8_sse2) PRIVATE
sym(vp8_ssim_parms_8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 15
    push        rsi
    push        rdi
    ; end prolog

    mov             rsi,        arg(0) ;s
    ; sp and rp are C ints: sign-extend the low 32 bits instead of trusting
    ; the upper half of the 64-bit argument slot, which the ABI leaves
    ; undefined for 32-bit arguments.
    movsxd          rcx,        dword arg(1) ;sp
    mov             rdi,        arg(2) ;r
    movsxd          rax,        dword arg(3) ;rp

    pxor            xmm0, xmm0
    pxor            xmm15,xmm15  ;sum_s
    pxor            xmm14,xmm14  ;sum_r
    pxor            xmm13,xmm13  ;sum_sq_s
    pxor            xmm12,xmm12  ;sum_sq_r
    pxor            xmm11,xmm11  ;sum_sxr

    mov             rdx, 8       ;row counter
.NextRow:

    ;grab source and reference pixels (8 bytes each)
    movq            xmm3, [rsi]
    movq            xmm4, [rdi]
    punpcklbw       xmm3, xmm0 ; low_s (bytes 0..7 widened to words)
    punpcklbw       xmm4, xmm0 ; low_r

    TABULATE_SSIM

    add             rsi, rcx   ; next s row
    add             rdi, rax   ; next r row

    dec             rdx        ; counter
    jnz .NextRow

    ; reduce each accumulator to a single value in its low lane
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    ; NOTE(review): movd stores only the low 32 bits of each total, while the
    ; prototype comment declares the outputs as unsigned long *. Where
    ; unsigned long is 64 bits the upper half of each output is left
    ; untouched - confirm callers zero-initialize or use a 32-bit type.
    mov             rdi,arg(4)
    movd            [rdi], xmm15    ; *sum_s
    mov             rdi,arg(5)
    movd            [rdi], xmm14    ; *sum_r
    mov             rdi,arg(6)
    movd            [rdi], xmm13    ; *sum_sq_s
    mov             rdi,arg(7)
    movd            [rdi], xmm12    ; *sum_sq_r
    mov             rdi,arg(8)
    movd            [rdi], xmm11    ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret