190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/* 2f71323e297a928af368937089d3ed71239786f86Andreas Huber * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 4f71323e297a928af368937089d3ed71239786f86Andreas Huber * Use of this source code is governed by a BSD-style license 5f71323e297a928af368937089d3ed71239786f86Andreas Huber * that can be found in the LICENSE file in the root of the source 6f71323e297a928af368937089d3ed71239786f86Andreas Huber * tree. An additional intellectual property rights grant can be found 7f71323e297a928af368937089d3ed71239786f86Andreas Huber * in the file PATENTS. All contributing project authors may 8f71323e297a928af368937089d3ed71239786f86Andreas Huber * be found in the AUTHORS file in the root of the source tree. 990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber */ 1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "memory.h" 1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "preproc.h" 1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "pragmas.h" 1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/**************************************************************************** 1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber* Macros 1890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber****************************************************************************/ 1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#define FRAMECOUNT 7 2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) ) 2190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 2290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/**************************************************************************** 2390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber* Imports 2490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber****************************************************************************/ 2590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberextern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); 2690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 2790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/**************************************************************************** 2890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber* Exported Global Variables 2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber****************************************************************************/ 3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); 3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/**************************************************************************** 3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * ROUTINE : temp_filter_wmt 3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. 3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * unsigned char *s : Pointer to source frame. 3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * unsigned char *d : Pointer to destination frame. 3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * int bytes : Number of bytes to filter. 4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * int strength : Strength of filter to apply. 4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * OUTPUTS : None. 4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * RETURNS : void 4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * FUNCTION : Performs a closesness adjusted temporarl blur 4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * SPECIAL NOTES : Destination frame can be same as source frame. 4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/ 5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid temp_filter_wmt 5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber( 5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pre_proc_instance *ppi, 5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber unsigned char *s, 5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber unsigned char *d, 5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int bytes, 5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int strength 5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber) 5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{ 6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int byte = 0; 6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber unsigned char *frameptr = ppi->frame_buffer; 6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3, 3, 3, 3, 3}; 6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16}; 6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber if (ppi->frame == 0) 6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber do 6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int i; 7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int frame = 0; 7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber do 7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber for (i = 0; i < 8; i++) 7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *frameptr = s[byte+i]; 7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ++frameptr; 7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ++frame; 8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber while (frame < FRAMECOUNT); 8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber for (i = 0; i < 8; i++) 8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber d[byte+i] = s[byte+i]; 8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber byte += 8; 8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber while (byte < bytes); 9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber else 9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int i; 9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int offset2 = (ppi->frame % FRAMECOUNT); 9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber do 9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __declspec(align(16)) unsigned short counts[8]; 10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __declspec(align(16)) unsigned short sums[8]; 10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __asm 10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov eax, offset2 10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov edi, s // source pixels 10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm1, xmm1 // accumulator 10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm7, xmm7 10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov esi, frameptr // accumulator 11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor xmm2, xmm2 // count 11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm3, QWORD PTR [edi] 11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq QWORD PTR [esi+8*eax], xmm3 11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm3, xmm2 // xmm3 source pixels 11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov ecx, FRAMECOUNT 11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber next_frame: 12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq xmm4, QWORD PTR [esi] // get frame buffer values 12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw xmm4, xmm7 // xmm4 frame buffer pixels 12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm6, xmm4 // save the pixel values 12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw xmm4, xmm3 // subtracted pixel values 12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw xmm4, xmm4 // square xmm4 12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd xmm5, strength 12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw xmm4, xmm5 // should be strength 12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw xmm4, threes // 3 * modifier 12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm5, sixteens // 16s 13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusw xmm5, xmm4 // 16 - modifiers 13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa xmm4, xmm5 // save the modifiers 13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw xmm4, xmm6 // multiplier values 13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusw xmm1, xmm4 // accumulator 13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusw xmm2, xmm5 // count 13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add esi, 8 // next frame 13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec ecx // next set of eight pixels 13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber jnz next_frame 13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa counts, xmm2 14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw xmm2, 1 // divide count by 2 for rounding 14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusw xmm1, xmm2 // rounding added in 14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov frameptr, esi 14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movdqa sums, xmm1 14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber for (i = 0; i < 8; i++) 14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; 15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber blurvalue >>= 16; 15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber d[i] = blurvalue; 15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber s += 8; 15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber d += 8; 15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber byte += 8; 15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber while (byte < bytes); 16090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ++ppi->frame; 16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __asm emms 16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber} 16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/**************************************************************************** 16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * ROUTINE : temp_filter_mmx 16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * INPUTS : pre_proc_instance *ppi : Pointer to pre-processor instance. 17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * unsigned char *s : Pointer to source frame. 17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * unsigned char *d : Pointer to destination frame. 17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * int bytes : Number of bytes to filter. 17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * int strength : Strength of filter to apply. 17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * OUTPUTS : None. 17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * RETURNS : void 17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * FUNCTION : Performs a closesness adjusted temporarl blur 18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * SPECIAL NOTES : Destination frame can be same as source frame. 18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber * 18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/ 18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid temp_filter_mmx 18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber( 18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pre_proc_instance *ppi, 18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber unsigned char *s, 18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber unsigned char *d, 19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int bytes, 19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int strength 19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber) 19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{ 19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int byte = 0; 19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber unsigned char *frameptr = ppi->frame_buffer; 19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __declspec(align(16)) unsigned short threes[] = { 3, 3, 3, 3}; 19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16}; 19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber if (ppi->frame == 0) 20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber do 20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int i; 20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int frame = 0; 20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber do 20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber for (i = 0; i < 4; i++) 21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *frameptr = s[byte+i]; 21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ++frameptr; 21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ++frame; 21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber while (frame < FRAMECOUNT); 21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber for (i = 0; i < 4; i++) 22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber d[byte+i] = s[byte+i]; 22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber byte += 4; 22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber while (byte < bytes); 22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber else 22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int i; 23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int offset2 = (ppi->frame % FRAMECOUNT); 23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber do 23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __declspec(align(16)) unsigned short counts[8]; 23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __declspec(align(16)) unsigned short sums[8]; 23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __asm 23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov eax, offset2 24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov edi, s // source pixels 24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm1, mm1 // accumulator 24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm7, mm7 24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov esi, frameptr // accumulator 24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pxor mm2, mm2 // count 24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm3, DWORD PTR [edi] 24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd DWORD PTR [esi+4*eax], mm3 24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm3, mm2 // mm3 source pixels 25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov ecx, FRAMECOUNT 25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber next_frame: 25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm4, DWORD PTR [esi] // get frame buffer values 25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber punpcklbw mm4, mm7 // mm4 frame buffer pixels 25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm6, mm4 // save the pixel values 25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubsw mm4, mm3 // subtracted pixel values 25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm4, mm4 // square mm4 25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movd mm5, strength 26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm4, mm5 // should be strength 26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm4, threes // 3 * modifier 26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm5, sixteens // 16s 26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psubusw mm5, mm4 // 16 - modifiers 26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq mm4, mm5 // save the modifiers 26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber pmullw mm4, mm6 // multiplier values 26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusw mm1, mm4 // accumulator 26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusw mm2, mm5 // count 26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber add esi, 4 // next frame 26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber dec ecx // next set of eight pixels 27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber jnz next_frame 27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq counts, mm2 27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber psrlw mm2, 1 // divide count by 2 for rounding 27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber paddusw mm1, mm2 // rounding added in 27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber mov frameptr, esi 27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber movq sums, mm1 27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber for (i = 0; i < 4; i++) 28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber { 28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber int blurvalue = sums[i] * ppi->fixed_divide[counts[i]]; 28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber blurvalue >>= 16; 28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber d[i] = blurvalue; 28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber s += 4; 29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber d += 4; 29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber byte += 4; 29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber while (byte < bytes); 29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber } 29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber 29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ++ppi->frame; 29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber __asm emms 29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber} 299