/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "memory.h"
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "preproc.h"
1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "pragmas.h"
1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
/****************************************************************************
*  Macros
****************************************************************************/

/* Number of history slots kept per pixel group in the frame buffer. */
#define FRAMECOUNT 7

/*
 * Round X up to the next multiple of 32, as an unsigned long.
 *
 * The argument is parenthesized so that an expression such as `p + 1` is
 * evaluated before the cast, and the mask is built at the width of
 * unsigned long so the upper bits survive where unsigned long is wider
 * than 32 bits.
 */
#define ROUNDUP32(X) ((((unsigned long)(X)) + 31UL) & ~31UL)

/****************************************************************************
*  Imports
****************************************************************************/

/*
 * Defined elsewhere. Takes three int out-pointers; judging by the parameter
 * names they receive MMX / XMM / WMT capability flags -- TODO confirm against
 * the definition.
 */
extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);

2790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
2890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber*  Exported Global Variables
2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber****************************************************************************/
3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : temp_filter_wmt
3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *s     : Pointer to source frame.
3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *d     : Pointer to destination frame.
3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  int bytes            : Number of bytes to filter.
4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  int strength         : Strength of filter to apply.
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : Performs a closesness adjusted temporarl blur
4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : Destination frame can be same as source frame.
4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid temp_filter_wmt
5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pre_proc_instance *ppi,
5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *s,
5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *d,
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    int bytes,
5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    int strength
5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    int byte = 0;
6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *frameptr = ppi->frame_buffer;
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    if (ppi->frame == 0)
6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        do
6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        {
7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            int i;
7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            int frame = 0;
7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            do
7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            {
7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                for (i = 0; i < 8; i++)
7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                {
7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                    *frameptr = s[byte+i];
7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                    ++frameptr;
7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                }
8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                ++frame;
8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            }
8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            while (frame < FRAMECOUNT);
8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            for (i = 0; i < 8; i++)
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                d[byte+i] = s[byte+i];
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            byte += 8;
8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        }
9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        while (byte < bytes);
9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    else
9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        int i;
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        int offset2 = (ppi->frame % FRAMECOUNT);
9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        do
9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        {
10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            __declspec(align(16)) unsigned short counts[8];
10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            __declspec(align(16)) unsigned short sums[8];
10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            __asm
10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            {
10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         eax, offset2
10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         edi, s                  // source pixels
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pxor        xmm1, xmm1              // accumulator
10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pxor        xmm7, xmm7
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         esi, frameptr           // accumulator
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pxor        xmm2, xmm2              // count
11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movq        xmm3, QWORD PTR [edi]
11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movq        QWORD PTR [esi+8*eax], xmm3
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                punpcklbw   xmm3, xmm2              // xmm3 source pixels
11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         ecx,  FRAMECOUNT
11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                next_frame:
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movq        xmm4, QWORD PTR [esi]   // get frame buffer values
12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                punpcklbw   xmm4, xmm7              // xmm4 frame buffer pixels
12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movdqa      xmm6, xmm4              // save the pixel values
12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                psubsw      xmm4, xmm3              // subtracted pixel values
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pmullw      xmm4, xmm4              // square xmm4
12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movd        xmm5, strength
12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                psrlw       xmm4, xmm5              // should be strength
12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pmullw      xmm4, threes            // 3 * modifier
12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movdqa      xmm5, sixteens          // 16s
13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                psubusw     xmm5, xmm4              // 16 - modifiers
13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movdqa      xmm4, xmm5              // save the modifiers
13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pmullw      xmm4, xmm6              // multiplier values
13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                paddusw     xmm1, xmm4              // accumulator
13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                paddusw     xmm2, xmm5              // count
13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                add         esi, 8                  // next frame
13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                dec         ecx                     // next set of eight pixels
13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                jnz         next_frame
13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movdqa      counts, xmm2
14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                psrlw       xmm2, 1                 // divide count by 2 for rounding
14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                paddusw     xmm1, xmm2              // rounding added in
14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         frameptr, esi
14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movdqa      sums, xmm1
14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            }
14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            for (i = 0; i < 8; i++)
14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            {
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                blurvalue >>= 16;
15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                d[i] = blurvalue;
15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            }
15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            s += 8;
15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            d += 8;
15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            byte += 8;
15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        }
15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        while (byte < bytes);
16090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ++ppi->frame;
16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm emms
16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : temp_filter_mmx
16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *s     : Pointer to source frame.
17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *d     : Pointer to destination frame.
17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  int bytes            : Number of bytes to filter.
17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  int strength         : Strength of filter to apply.
17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : Performs a closesness adjusted temporarl blur
18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : Destination frame can be same as source frame.
18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid temp_filter_mmx
18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    pre_proc_instance *ppi,
18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *s,
18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *d,
19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    int bytes,
19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    int strength
19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    int byte = 0;
19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *frameptr = ppi->frame_buffer;
19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    if (ppi->frame == 0)
20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        do
20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        {
20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            int i;
20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            int frame = 0;
20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            do
20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            {
20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                for (i = 0; i < 4; i++)
21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                {
21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                    *frameptr = s[byte+i];
21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                    ++frameptr;
21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                }
21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                ++frame;
21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            }
21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            while (frame < FRAMECOUNT);
21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            for (i = 0; i < 4; i++)
22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                d[byte+i] = s[byte+i];
22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            byte += 4;
22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        }
22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        while (byte < bytes);
22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    else
22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        int i;
23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        int offset2 = (ppi->frame % FRAMECOUNT);
23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        do
23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        {
23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            __declspec(align(16)) unsigned short counts[8];
23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            __declspec(align(16)) unsigned short sums[8];
23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            __asm
23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            {
23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         eax, offset2
24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         edi, s                  // source pixels
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pxor        mm1, mm1                // accumulator
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pxor        mm7, mm7
24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         esi, frameptr           // accumulator
24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pxor        mm2, mm2                // count
24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movd        mm3, DWORD PTR [edi]
24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movd        DWORD PTR [esi+4*eax], mm3
24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                punpcklbw   mm3, mm2                // mm3 source pixels
25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         ecx,  FRAMECOUNT
25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                next_frame:
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movd        mm4, DWORD PTR [esi]    // get frame buffer values
25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                punpcklbw   mm4, mm7                // mm4 frame buffer pixels
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movq        mm6, mm4                // save the pixel values
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                psubsw      mm4, mm3                // subtracted pixel values
25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pmullw      mm4, mm4                // square mm4
25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movd        mm5, strength
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                psrlw       mm4, mm5                // should be strength
26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pmullw      mm4, threes             // 3 * modifier
26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movq        mm5, sixteens           // 16s
26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                psubusw     mm5, mm4                // 16 - modifiers
26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movq        mm4, mm5                // save the modifiers
26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                pmullw      mm4, mm6                // multiplier values
26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                paddusw     mm1, mm4                // accumulator
26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                paddusw     mm2, mm5                // count
26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                add         esi, 4                  // next frame
26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                dec         ecx                     // next set of eight pixels
27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                jnz         next_frame
27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movq        counts, mm2
27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                psrlw       mm2, 1                  // divide count by 2 for rounding
27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                paddusw     mm1, mm2                // rounding added in
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                mov         frameptr, esi
27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                movq        sums, mm1
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            }
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            for (i = 0; i < 4; i++)
28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            {
28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                blurvalue >>= 16;
28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber                d[i] = blurvalue;
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            }
28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            s += 4;
29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            d += 4;
29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber            byte += 4;
29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        }
29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        while (byte < bytes);
29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ++ppi->frame;
29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm emms
29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
299