/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


/****************************************************************************
*
*   Module Title :     scaleopt.cpp
*
*   Description  :     Optimized scaling functions
*
****************************************************************************/
1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "pragmas.h"
2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
2190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
2290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
2390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
2490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber*  Module Statics
2590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber****************************************************************************/
2690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short one_fifth[]  = { 51, 51, 51, 51 };
2790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short two_fifths[] = { 102, 102, 102, 102 };
2890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short three_fifths[] = { 154, 154, 154, 154 };
2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short four_fifths[] = { 205, 205, 205, 205 };
3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short four_ones[] = { 1, 1, 1, 1};
3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short const45_2[] = {205, 154, 102,  51 };
3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short const45_1[] = { 51, 102, 154, 205 };
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned char  mask45[] = { 0, 0, 0, 0, 0, 0, 255, 0};
3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short const35_2[] = { 154,  51, 205, 102 };
3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short const35_1[] = { 102, 205,  51, 154 };
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "vpx_scale/vpxscale.h"
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber#include "vpx_mem/vpx_mem.h"
4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : horizontal_line_3_5_scale_mmx
4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : const unsigned char *source :
4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int source_width    :
4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *dest         :
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width      :
5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 3 to 5 up-scaling of a horizontal line of pixels.
5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : None.
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid horizontal_line_3_5_scale_mmx
6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    const unsigned char *source,
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int source_width,
6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void) dest_width;
7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        push ebx
7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    source
7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,    dest
7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    source_width
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         edx,    [esi+ecx-3];
8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    const35_1       // mm5 = 66 xx cd xx 33 xx 9a xx
8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    const35_2       // mm6 = 9a xx 33 xx cd xx 66 xx
8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7             // clear mm7
8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        horiz_line_3_5_loop:
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov        eax,    DWORD PTR [esi] // eax = 00 01 02 03
9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov        ebx,    eax
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        and         ebx,    0xffff00        // ebx = xx 01 02 xx
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    eax             // ecx = 00 01 02 03
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        and         eax,    0xffff0000      // eax = xx xx 02 03
9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xor         ecx,    eax             // ecx = 00 01 xx xx
9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        shr         ebx,    8               // ebx = 01 02 xx xx
10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        or          eax,    ebx             // eax = 01 02 02 03
10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        shl         ebx,    16              // ebx = xx xx 01 02
10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm1,    eax             // mm1 = 01 02 02 03 xx xx xx xx
10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        or          ebx,    ecx             // ebx = 00 01 01 02
10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 03 xx
10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,    ebx             // mm0 = 00 01 01 02
11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm6             //
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5             //
11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         [edi],  ebx             // writeoutput 00 xx xx xx
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    3
11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    5
11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm1
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4
12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        cmp         esi,    edx
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm7
12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD Ptr [edi-4], mm0
12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jl          horiz_line_3_5_loop
12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber//Exit:
13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         eax,    DWORD PTR [esi] // eax = 00 01 02 03
13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ebx,    eax
13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        and         ebx,    0xffff00        // ebx = xx 01 02 xx
13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    eax             // ecx = 00 01 02 03
13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        and         eax,    0xffff0000      // eax = xx xx 02 03
13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xor         ecx,    eax             // ecx = 00 01 xx xx
13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        shr         ebx,    8               // ebx = 01 02 xx xx
14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        or          eax,    ebx             // eax = 01 02 02 03
14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        shl         eax,    8               // eax = xx 01 02 02
14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        and         eax,    0xffff0000      // eax = xx xx 02 02
14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        or          eax,    ebx             // eax = 01 02 02 02
14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        shl         ebx,    16              // ebx = xx xx 01 02
14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm1,    eax             // mm1 = 01 02 02 02 xx xx xx xx
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        or          ebx,    ecx             // ebx = 00 01 01 02
15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 02 xx 02 xx
15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,    ebx             // mm0 = 00 01 01 02
15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm6             //
15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 01 xx 02 xx
15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5             //
15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         [edi],  ebx             // writeoutput 00 xx xx xx
16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm1
16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4
16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm7
16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD Ptr [edi+1], mm0
16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pop ebx
17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : horizontal_line_4_5_scale_mmx
17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : const unsigned char *source :
18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int source_width    :
18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *dest         :
18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width      :
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 4 to 5 up-scaling of a horizontal line of pixels.
19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : None.
19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid horizontal_line_4_5_scale_mmx
19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    const unsigned char *source,
19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int source_width,
19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void)dest_width;
20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    source
20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,    dest
21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    source_width
21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         edx,    [esi+ecx-8];
21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    const45_1       // mm5 = 33 xx 66 xx 9a xx cd xx
21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    const45_2       // mm6 = cd xx 9a xx 66 xx 33 xx
21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    round_values     // mm4 = 80 xx 80 xx 80 xx 80 xx
21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7             // clear mm7
21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        horiz_line_4_5_loop:
22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    QWORD PTR [esi]           // mm0 = 00 01 02 03 04 05 06 07
22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    QWORD PTR [esi+1];        // mm1 = 01 02 03 04 05 06 07 08
22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1             // mm3 = 01 02 03 04 05 06 07 08
22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi],  mm0             // write output 00 xx xx xx
22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+5], mm2            // write ouput 05 xx xx xx
23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    mm6             // 05*205 06*154 07*102 08* 51
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm1             // added round values
24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4
24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm7
24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    10
25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    8
25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm3             //
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm4             // added round values
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        cmp         esi,    edx
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8
25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm2,    mm7
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi-4], mm2 // writeoutput 06 07 08 09
26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jl         horiz_line_4_5_loop
26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber//Exit:
26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi]           // mm0 = 00 01 02 03 04 05 06 07
26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    mm0             // mm1 = 00 01 02 03 04 05 06 07
26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0             // mm2 = 00 01 02 03 04 05 06 07
26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm1,    8               // mm1 = 01 02 03 04 05 06 07 00
27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mask45          // mm3 = 00 00 00 00 00 00 ff 00
27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pand        mm3,    mm1             // mm3 = 00 00 00 00 00 00 07 00
27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllq       mm3,    8               // mm3 = 00 00 00 00 00 00 00 07
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,    mm3             // mm1 = 01 02 03 04 05 06 07 07
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi],  mm0   // write output 00 xx xx xx
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7             // mm0 = 00 xx 01 xx 02 xx 03 xx
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7             // mm1 = 01 xx 02 xx 03 xx 04 xx
28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5             // 00* 51 01*102 02*154 03*205
28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm6             // 01*205 02*154 03*102 04* 51
28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7             // mm2 = 04 xx 05 xx 06 xx 07 xx
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+5], mm2  // write ouput 05 xx xx xx
28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm5             // 04* 51 05*102 06*154 07*205
29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7             // mm3 = 05 xx 06 xx 07 xx 08 xx
29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    mm6             // 05*205 06*154 07*102 07* 51
29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm1             // added round values
29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8               // output: 01 xx 02 xx 03 xx 04 xx
29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm7             // 01 02 03 04 xx xx xx xx
29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+1], mm0  // write output 01 02 03 04
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm3             //
30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm4             // added round values
30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8
30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm2,    mm7
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+6], mm2  // writeoutput 06 07 08 09
30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
/****************************************************************************
 *
 *  ROUTINE       : vertical_band_4_5_scale_mmx
 *
 *  INPUTS        : unsigned char *dest    :
 *                  unsigned int dest_pitch :
 *                  unsigned int dest_width :
 *
 *  OUTPUTS       : None.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : 4 to 5 up-scaling of a 4 pixel high band of pixels.
 *
 *  SPECIAL NOTES : The routine uses the first line of the band below
 *                  the current band. The function also has a "C" only
 *                  version.
 *
 ****************************************************************************/
33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid vertical_band_4_5_scale_mmx
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_pitch,
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
    // In-place vertical 4:5 blend of one band, 8 bytes per loop iteration.
    // With "row k" meaning dest + k*dest_pitch, the loop reads rows 0..3
    // (a, b, c, d) and row 5 ("an", labelled below as src[0] of the next
    // group), then overwrites:
    //     row 1 = a*one_fifth    + b*four_fifths
    //     row 2 = b*two_fifths   + c*three_fifths
    //     row 3 = c*three_fifths + d*two_fifths
    //     row 4 = d*four_fifths  + an*one_fifth
    // Row 0 is never written. Each sum is formed in 16-bit lanes, has
    // round_values added, is shifted right by 8 and is packed back to bytes
    // with unsigned saturation. The weight and rounding constants are defined
    // outside this block; the shift by 8 suggests they are fractions scaled
    // by 256 -- TODO confirm against their definitions.
    // The loop body runs before the counter is tested and steps by 8, so it
    // always executes at least once and covers dest_width rounded up to a
    // multiple of 8 columns.
    // NOTE(review): no emms is executed here; presumably the caller clears
    // the MMX state -- confirm.
34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    dest                    // esi = dest; the band is both source and destination
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    dest_pitch               // ecx = pitch (byte offset between rows)
34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         edi,    [esi+ecx*2]             // edi = two lines below dest
34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    ecx                     // edi = three lines below dest (row 3)
34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7                     // mm7 = 0, zero operand for byte-to-word unpacking
35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,    dest_width               // edx = columns remaining (loop counter)
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        vs_4_5_loop:
35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    QWORD ptr [esi]         // src[0];
35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // Make a copy
35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // unpack low to word
35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    one_fifth
36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // unpack high to word
36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5                     // a * 1/5
36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1                     // make a copy
36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7                     // unpack low to word
36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm5                     // a * 1/5
36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    four_fifths               // mm6 = 4/5
37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                     // copy of low b
37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm6                     // b * 4/5
37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7                     // unpack high to word
37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm3                     // copy of high b
37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    mm6                     // b * 4/5
37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    round_values             // + 128
38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    round_values             // + 128
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
38590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8
38790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2                     // des[1]
38890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
39090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
39190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm1, mm3 --- Src[1] (low / high halves, already unpacked to words)
39390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm0 --- Src[2]
39490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm7 for unpacking
39590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    two_fifths
39790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // make a copy
39890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
39990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm5                     // b * 2/5
40090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    three_fifths
40190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // unpack low to word
40490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    mm5                     // b * 2/5
40590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm0                     // make copy of c
40790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // unpack high to word
40890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
40990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm6                     // c * 3/5
41090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm2                     // copy of high c
41190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    mm6                     // c * 3/5
41390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
41490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
41690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    round_values             // + 128
41790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
41890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    round_values             // + 128
41990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,    8
42090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm3,    8
42290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,    mm3                     // des[2]
42390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
42590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    [edi]                   // mm1=Src[3];
42690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
42790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm0, mm2 --- Src[2] (low / high halves, already unpacked to words)
42890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm1 --- Src[3]
42990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm6 --- 3/5
43090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm7 for unpacking
43190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm6                     // c * 3/5
43390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    two_fifths               // mm5 = 2/5
43490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1                     // make a copy
43690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm6                     // c * 3/5
43790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
43890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7                     // unpack low
43990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                     // make a copy
44090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7                     // unpack high
44290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm5                     // d * 2/5
44390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm3                     // make a copy
44590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm6,    mm5                     // d * 2/5
44690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
44790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
44890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
44990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    round_values             // + 128
45190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    round_values             // + 128
45290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
45490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8
45590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2                     // des[3]
45790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [edi], mm0            // write des[3]
45890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
45990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        //  mm1, mm3 --- Src[3] (low / high halves, already unpacked to words)
46090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        //  mm7 -- cleared for unpacking
46190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [edi+ecx*2]             // mm0 = src[0] of the next group (row 5 = dest + 5*pitch)
46390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    four_fifths              // mm5 = 4/5
46590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm5                     // d * 4/5
46690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
46790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    one_fifth                // mm6 = 1/5
46890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // make a copy
46990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    mm5                     // d * 4/5
47190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // unpack low
47290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm6                     // an * 1/5
47490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // unpack high
47590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    mm0                     // d * 4/5 + an * 1/5
47790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm6                     // an * 1/5
47890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
47990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    mm2                     // d * 4/5 + an * 1/5
48090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    round_values             // + 128
48190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    round_values             // + 128
48390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,    8
48490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm3,    8
48690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,    mm3                     // des[4]
48790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
48890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [edi+ecx], mm1        // write des[4]
48990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    8                       // advance both row pointers by 8 bytes
49190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    8
49290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         edx,    8                       // 8 columns done
49490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg         vs_4_5_loop                      // repeat while the (signed) remainder is > 0
49590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
49690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
49790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
49890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
49990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
50090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : last_vertical_band_4_5_scale_mmx
50190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
50290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : unsigned char *dest    :
50390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_pitch :
50490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width :
50590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
50690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
50790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
50890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : None
50990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
51090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 4 to 5 up-scaling of the last 4-pixel high band in an image.
51190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
51290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : Unlike vertical_band_4_5_scale_mmx, this routine does not
51390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  read a line from a band below: the fifth output line is a
51490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  copy of the fourth source line. A "C"-only version also exists.
51590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
51690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
51790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
51890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid last_vertical_band_4_5_scale_mmx
51990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
52090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
52190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_pitch,
52290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
52390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
52490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
    // In-place vertical 4:5 blend of the last band, 8 bytes per loop
    // iteration. With "row k" meaning dest + k*dest_pitch, the loop reads
    // rows 0..3 (a, b, c, d) only and overwrites:
    //     row 1 = a*one_fifth    + b*four_fifths
    //     row 2 = b*two_fifths   + c*three_fifths
    //     row 3 = c*three_fifths + d*two_fifths
    //     row 4 = d (plain copy of the original row 3)
    // Row 0 is never written and nothing past row 4 is touched. Each blended
    // sum is formed in 16-bit lanes, has round_values added, is shifted right
    // by 8 and is packed back to bytes with unsigned saturation. The weight
    // and rounding constants are defined outside this block; the shift by 8
    // suggests they are fractions scaled by 256 -- TODO confirm against
    // their definitions.
    // The loop body runs before the counter is tested and steps by 8, so it
    // always executes at least once and covers dest_width rounded up to a
    // multiple of 8 columns.
    // NOTE(review): no emms is executed here; presumably the caller clears
    // the MMX state -- confirm.
52590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
52690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
52790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    dest                    // esi = dest; the band is both source and destination
52890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    dest_pitch               // ecx = pitch (byte offset between rows)
52990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         edi,    [esi+ecx*2]             // edi = two lines below dest
53190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    ecx                     // edi = three lines below dest (row 3)
53290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7                     // mm7 = 0, zero operand for byte-to-word unpacking
53490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,    dest_width               // edx = columns remaining (loop counter)
53590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        last_vs_4_5_loop:
53790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
53890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    QWORD ptr [esi]         // src[0];
53990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
54090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // Make a copy
54290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // unpack low to word
54390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    one_fifth
54590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // unpack high to word
54690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5                     // a * 1/5
54890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
54990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1                     // make a copy
55090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7                     // unpack low to word
55190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm5                     // a * 1/5
55390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    four_fifths               // mm6 = 4/5
55490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                     // copy of low b
55690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm6                     // b * 4/5
55790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
55890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7                     // unpack high to word
55990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm3                     // copy of high b
56090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    mm6                     // b * 4/5
56290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4                     // a * 1/5 + b * 4/5
56390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm5                     // a * 1/5 + b * 4/5
56590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    round_values             // + 128
56690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
56790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    round_values             // + 128
56890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
56990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8
57190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2                     // des[1]
57290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
57490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
57590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
57690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm1, mm3 --- Src[1] (low / high halves, already unpacked to words)
57790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm0 --- Src[2]
57890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm7 for unpacking
57990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    two_fifths
58190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // make a copy
58290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm5                     // b * 2/5
58490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    three_fifths
58590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
58790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // unpack low to word
58890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    mm5                     // b * 2/5
58990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm0                     // make copy of c
59190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // unpack high to word
59290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm6                     // c * 3/5
59490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm2                     // copy of high c
59590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    mm6                     // c * 3/5
59790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    mm4                     // b * 2/5 + c * 3/5
59890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
59990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    mm5                     // b * 2/5 + c * 3/5
60090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    round_values             // + 128
60190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    round_values             // + 128
60390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,    8
60490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm3,    8
60690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,    mm3                     // des[2]
60790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
60890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
60990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    [edi]                   // mm1=Src[3];
61090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
61190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [edi+ecx], mm1        // write des[4] = unblended copy of src[3]
61290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
61390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm0, mm2 --- Src[2] (low / high halves, already unpacked to words)
61490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm1 --- Src[3]
61590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm6 --- 3/5
61690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm7 for unpacking
61790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
61890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm6                     // c * 3/5
61990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    two_fifths               // mm5 = 2/5
62090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1                     // make a copy
62290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm6                     // c * 3/5
62390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7                     // unpack low
62590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                     // make a copy
62690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
62790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7                     // unpack high
62890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm5                     // d * 2/5
62990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm3                     // make a copy
63190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm6,    mm5                     // d * 2/5
63290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4                     // c * 3/5 + d * 2/5
63490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm6                     // c * 3/5 + d * 2/5
63590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    round_values             // + 128
63790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    round_values             // + 128
63890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
63990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
64090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8
64190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
64290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2                     // des[3]
64390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [edi], mm0            // write des[3]
64490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
64590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        //  mm1, mm3 --- Src[3] (still held, but not used again in this routine)
64690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        //  mm7 -- cleared for unpacking
64790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    8                       // advance both row pointers by 8 bytes
64890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    8
64990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         edx,    8                       // 8 columns done
65190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg          last_vs_4_5_loop                // repeat while the (signed) remainder is > 0
65290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
65390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
65490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
65590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
65690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
65790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : vertical_band_3_5_scale_mmx
65890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
65990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : unsigned char *dest    :
66090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_pitch :
66190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width :
66290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
66390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
66490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
66590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
66690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
66790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
66890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
66990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : The routine uses the first line of the band below
67090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  the current band. A "C"-only version of this function
67190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  also exists.
67290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
67390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
67490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
67590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid vertical_band_3_5_scale_mmx
67690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
67790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
67890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_pitch,
67990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
68090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
68190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
68290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
68390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
68490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    dest                    // Get the source and destination pointer
68590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    dest_pitch               // Get the pitch size
68690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
68790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         edi,    [esi+ecx*2]             // tow lines below
68890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    ecx                     // three lines below
68990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7                     // clear out mm7
69190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,    dest_width               // Loop counter
69290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        vs_3_5_loop:
69490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    QWORD ptr [esi]         // src[0];
69690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    QWORD ptr [esi+ecx]     // src[1];
69790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // Make a copy
69990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // unpack low to word
70090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    two_fifths               // mm5 = 2/5
70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // unpack high to word
70390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
70490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5                     // a * 2/5
70590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
70690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1                     // make a copy
70790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7                     // unpack low to word
70890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
70990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm5                     // a * 2/5
71090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    three_fifths             // mm6 = 3/5
71190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                     // copy of low b
71390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm6                     // b * 3/5
71490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7                     // unpack high to word
71690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm3                     // copy of high b
71790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    mm6                     // b * 3/5
71990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5
72090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5
72290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    round_values             // + 128
72390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    round_values             // + 128
72590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
72690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
72790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8
72890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2                     // des [1]
72990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [esi+ecx], mm0        // write des[1]
73190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi+ecx*2]             // mm0 = src[2]
73290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm1, mm3 --- Src[1]
73490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm0 --- Src[2]
73590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm7 for unpacking
73690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                     // b low
73890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    four_fifths              // b * 4/5 low
73990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm3                     // b high
74190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    four_fifths              // b * 4/5 high
74290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // c
74490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    one_fifth                // b * 1/5
74590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // c low
74790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    one_fifth                // b * 1/5
74890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm0                     // make copy of c low
75090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // c high
75190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
75290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm6,    one_fifth                // c * 1/5 low
75390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,    mm2                     // make copy of c high
75490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
75590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm7,    one_fifth                // c * 1/5 high
75690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
75790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
75890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
75990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm0                     // make copy of c low
76090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm6,    four_fifths              // c * 4/5 low
76290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,    mm2                     // make copy of c high
76390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm7,    four_fifths              // c * 4/5 high
76590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
76790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
76890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    round_values             // + 128
77090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    round_values             // + 128
77190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,    8
77390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm3,    8
77490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,    mm3                     // des[2]
77690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2]
77790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm4,    round_values             // + 128
77990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5,    round_values             // + 128
78090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm4,    8
78290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm5,    8
78390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm4,    mm5                     // des[3]
78590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [edi], mm4            // write des[3]
78690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        //  mm0, mm2 --- Src[3]
78890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7                     // clear mm7 for unpacking
79090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    [edi+ecx*2]             // mm1 = Src[0] of the next group
79190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
79290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    three_fifths             // mm5 = 3/5
79390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5                     // d * 3/5
79490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
79590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    two_fifths                // mm6 = 2/5
79690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1                     // make a copy
79790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
79890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm5                     // d * 3/5
79990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7                     // unpack low
80090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm6                     // an * 2/5
80290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7                     // unpack high
80390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm1                     // d * 3/5 + an * 2/5
80590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    mm6                     // an * 2/5
80690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
80790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm3                     // d * 3/5 + an * 2/5
80890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    round_values             // + 128
80990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    round_values             // + 128
81190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
81290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8
81490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2                     // des[4]
81590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [edi+ecx], mm0        // write des[4]
81790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    8
81990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    8
82090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
82190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         edx,    8
82290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg          vs_3_5_loop
82390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
82490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
82590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
82690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
82790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
82890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : last_vertical_band_3_5_scale_mmx
82990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
83090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : unsigned char *dest    : Row 0 of the band; rows are read and rewritten in place.
83190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_pitch : Byte offset from one row to the next.
83290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width : Bytes per row to process, 8 per pass.
83390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
83490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : Rows 1, 2, 3 and 4 below dest are overwritten.
83590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
83690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
83790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
83890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 3 to 5 up-scaling of a 3-pixel high band of pixels.
 *                  With a, b, c = rows 0, 1, 2 as found on entry:
 *                    row 1 = a * 2/5 + b * 3/5
 *                    row 2 = b * 4/5 + c * 1/5
 *                    row 3 = b * 1/5 + c * 4/5
 *                    row 4 = c, copied unchanged
 *                  Row 0 is not written. The weights are the constants
 *                  two_fifths, three_fifths, four_fifths and one_fifth
 *                  (defined outside this routine); every weighted sum has
 *                  round_values added and is then shifted right by 8 bits.
83990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
84090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : This variant reads only rows 0-2 of its own band; it does
84190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  not read any line of a band below. The function also has a "C" only
84290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  version.
 *                  mm7 is the zero operand of the byte-to-word unpacks and is
 *                  reused as scratch inside the loop, so it is cleared again
 *                  at the end of every pass.
 *                  The loop test sits at the bottom: at least 8 bytes per row
 *                  are processed and the width is effectively rounded up to a
 *                  multiple of 8. No emms is executed in this routine.
 *                  NOTE(review): confirm the caller restores the FPU state.
84390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
84490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
84590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
84690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid last_vertical_band_3_5_scale_mmx
84790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
84890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
84990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_pitch,
85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
85190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
85290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
85490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
85590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    dest                    // esi = row 0 of the band (source and destination)
85690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    dest_pitch               // ecx = pitch in bytes
85790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
85890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         edi,    [esi+ecx*2]             // two lines below
85990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    ecx                     // three lines below: edi = row 3
86090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7                     // mm7 = 0 for the first pass
86290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,    dest_width               // Loop counter (bytes left in the row)
86390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        last_vs_3_5_loop:
86690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    QWORD ptr [esi]         // a = src[0]
86890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    QWORD ptr [esi+ecx]     // b = src[1]
86990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
87090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // copy a for the high half
87190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // a low, bytes to words (needs mm7 == 0)
87290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
87390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    two_fifths               // mm5 = 2/5
87490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // a high, bytes to words
87590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
87690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,    mm5                     // a * 2/5 low
87790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
87890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1                     // copy b for the high half
87990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7                     // b low, bytes to words
88090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm5                     // a * 2/5 high
88290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    three_fifths             // mm6 = 3/5
88390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                     // copy of low b
88590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm6                     // b * 3/5 low
88690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
88790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7                     // b high, bytes to words
88890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm3                     // copy of high b
88990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
89090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    mm6                     // b * 3/5 high
89190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm4                     // a * 2/5 + b * 3/5 low
89290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
89390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm5                     // a * 2/5 + b * 3/5 high
89490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    round_values             // + 128
89590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
89690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    round_values             // + 128
89790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8                       // drop the 8 fraction bits, low
89890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
89990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm2,    8                       // drop the 8 fraction bits, high
90090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2                     // des[1], words back to bytes
90190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
90290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [esi+ecx], mm0        // write des[1] (b stays unpacked in mm1/mm3)
90390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi+ecx*2]             // mm0 = c = src[2]
90490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
90590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
90690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
90790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm1, mm3 --- src[1] (b), unpacked low / high
90890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm0 --- src[2] (c), still packed bytes
90990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        // mm7 --- zero, for unpacking
91090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
91190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm1                     // b low
91290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    four_fifths              // b * 4/5 low
91390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
91490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [edi+ecx], mm0        // write des[4] = c, copied unchanged
91590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
91690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm3                     // b high
91790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    four_fifths              // b * 4/5 high
91890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
91990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // copy c for the high half
92090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    one_fifth                // b * 1/5 low
92190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
92290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // c low
92390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    one_fifth                // b * 1/5 high
92490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
92590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm0                     // make copy of c low
92690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // c high (last unpack of this pass)
92790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
92890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm6,    one_fifth                // c * 1/5 low
92990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,    mm2                     // make copy of c high (mm7 is no longer zero)
93090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
93190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm7,    one_fifth                // c * 1/5 high
93290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    mm6                     // b * 4/5 + c * 1/5 low
93390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
93490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    mm7                     // b * 4/5 + c * 1/5 high
93590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    mm0                     // make copy of c low
93690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
93790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm6,    four_fifths              // c * 4/5 low
93890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm7,    mm2                     // make copy of c high
93990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm7,    four_fifths              // c * 4/5 high
94190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm4,    mm6                     // b * 1/5 + c * 4/5 low
94390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5,    mm7                     // b * 1/5 + c * 4/5 high
94490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    round_values             // + 128
94690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    round_values             // + 128
94790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
94890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,    8
94990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm3,    8
95090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,    mm3                     // des[2]
95290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [esi+ecx*2], mm1      // write des[2] (c was already loaded above)
95390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm4,    round_values             // + 128
95590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5,    round_values             // + 128
95690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
95790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm4,    8
95890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm5,    8
95990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm4,    mm5                     // des[3]
96190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        QWORD ptr [edi], mm4            // write des[3]
96290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        //  mm7 now holds c * 4/5 high; the next pass unpacks with it,
        //  so it has to be zero again before looping.
        pxor        mm7,    mm7                     // clear mm7 for unpacking
96490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    8                       // next 8 columns, row 3 pointer
96690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    8                       // next 8 columns, row 0 pointer
96790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
96890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         edx,    8                       // 8 bytes done; sets the flags for jg
96990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg          last_vs_3_5_loop
97090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
97190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
97290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
97390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
97490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
97590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : vertical_band_1_2_scale_mmx
97690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
97790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : unsigned char *dest    : Upper source row of the band.
97890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_pitch : Byte offset from one row to the next.
97990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width : Bytes per row to process, 8 per pass.
98090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
98190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : The row at dest + dest_pitch is overwritten.
98290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
98390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
98490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
98590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 1 to 2 up-scaling of a band of pixels.
 *                  Each byte of the row at dest + dest_pitch becomes
 *                  (a + b + four_ones) >> 1, where a is the byte at dest and
 *                  b the byte at dest + 2 * dest_pitch. four_ones is defined
 *                  outside this routine (per its name and the comment below,
 *                  1 in each 16-bit lane).
98690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
98790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : The routine reads the row at dest + 2 * dest_pitch, i.e. the first line of the band below
98890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  the current band. The function also has a "C" only
98990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  version.
 *                  The loop test sits at the bottom: at least 8 bytes are
 *                  processed and the width is effectively rounded up to a
 *                  multiple of 8. No emms is executed in this routine.
 *                  NOTE(review): confirm the caller restores the FPU state.
99090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
99190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
99290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
99390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid vertical_band_1_2_scale_mmx
99490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
99590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
99690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_pitch,
99790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
99890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
99990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
100090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
100190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
100290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    dest                    // esi = upper source row (Src[0])
100490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    dest_pitch               // ecx = pitch in bytes
100590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7                     // mm7 = 0, zero operand for the unpacks; never modified in the loop
100790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,    dest_width               // Loop counter (bytes left in the row)
100890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
100990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        vs_1_2_loop:
101090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi]                   // a = Src[0], 8 bytes of the upper row
101290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    [esi + ecx * 2]         // b = Src[1], 8 bytes from two rows below
101390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0                     // copy a for the high half
101590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1                     // copy b for the high half
101690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
101790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7                     // low Src[0], bytes to words
101890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    four_ones                // mm6= 1, 1, 1, 1 (rounding term)
101990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7                     // low Src[1], bytes to words
102190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm1                     // low (a + b)
102290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7                     // high Src[0], bytes to words
102490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm6                     // low (a + b + 1)
102590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7                     // high Src[1], bytes to words
102790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm3                     // high (a + b)
102890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
102990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,    1                       // low (a + b + 1) / 2
103090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm6                     // high (a + b + 1)
103190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2,    1                       // high (a + b + 1) / 2
103390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2                     // pack both halves back to 8 bytes
103490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [esi+ecx], mm0                  // write the 8 averaged bytes to the row in between
103690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    8                       // next 8 columns
103790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
103890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         edx,    8                       // 8 bytes done; sets the flags for jg
103990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg          vs_1_2_loop
104090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
104190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
104290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
104390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
104490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
104590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
104690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : last_vertical_band_1_2_scale_mmx
104790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
104890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : unsigned char *dest    :
104990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_pitch :
105090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width :
105190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
105290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
105390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
105490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
105590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
105690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 1 to 2 up-scaling of band of pixels.
105790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
105890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : The routine uses the first line of the band below
105990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  the current band. The function also has an "C" only
106090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  version.
106190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
106290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
106390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
106490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid last_vertical_band_1_2_scale_mmx
106590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
106690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
106790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_pitch,
106890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
106990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
107090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
107190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
107290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
107390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    dest                    // Get the source and destination pointer
107490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    dest_pitch               // Get the pitch size
107590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,    dest_width               // Loop counter
107790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
107890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        last_vs_1_2_loop:
107990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi]                   // get Src[0]
108190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [esi+ecx], mm0                  // write out eight bytes
108290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    8
108490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         edx,    8
108590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
108690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg         last_vs_1_2_loop
108790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
108890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
108990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
109090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
109190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
109290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : horizontal_line_1_2_scale
109390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
109490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : const unsigned char *source :
109590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int source_width    :
109690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *dest         :
109790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width      :
109890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
109990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
110090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
110190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
110290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
110390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 1 to 2 up-scaling of a horizontal line of pixels.
110490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
110590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : None.
110690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
110790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
110890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
110990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid horizontal_line_1_2_scale_mmx
111090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
111190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    const unsigned char *source,
111290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int source_width,
111390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
111490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
111590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
111690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
111790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void) dest_width;
111890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
111990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
112090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
112190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    source
112290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,    dest
112390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
112490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7
112590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    four_ones
112690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
112790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    source_width
112890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
112990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        hs_1_2_loop:
113090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi]
113290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    [esi+1]
113390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0
113590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1
113690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
113790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm0
113890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7
113990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7
114190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm1
114290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm6
114490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7
114590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7
114790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm3
114890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
114990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm6
115090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,    1
115190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2,    1
115390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2
115490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm4
115690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,    mm0
115790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
115890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [edi],  mm2
115990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm4,    mm0
116090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
116190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [edi+8], mm4
116290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    8
116390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
116490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    16
116590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         ecx,    8
116690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
116790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        cmp         ecx,    8
116890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg          hs_1_2_loop
116990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber// last eight pixel
117190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi]
117390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,    mm0
117490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm0
117690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1
117790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
117890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm1,    8
117990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm3,    56
118090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
118190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllq       mm3,    56
118290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        por         mm1,    mm3
118390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
118490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm1
118590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,    mm0
118690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
118790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,    mm7
118890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7
118990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm1
119190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,    mm6
119290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm2,    mm7
119490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm3,    mm7
119590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm3
119790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm2,    mm6
119890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
119990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm0,    1
120090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psraw       mm2,    1
120190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
120290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm2
120390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm2,    mm4
120490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
120590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,    mm0
120690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [edi],  mm2
120790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
120890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpckhbw   mm4,    mm0
120990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        [edi+8], mm4
121090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
121190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
121290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
121790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short const54_2[] = {  0,  64, 128, 192 };
121890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128,  64 };
121990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
122090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
122190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
122290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
122390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : horizontal_line_5_4_scale_mmx
122490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
122590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : const unsigned char *source : Pointer to source data.
122690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int source_width    : Stride of source.
122790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *dest         : Pointer to destination data.
122890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width      : Stride of destination (NOT USED).
122990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
123090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
123190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
123290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
123390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
123490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : Copies horizontal line of pixels from source to
123590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  destination scaling up by 4 to 5.
123690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
123790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : None.
123890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
123990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
124090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
124190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid horizontal_line_5_4_scale_mmx
124290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
124390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    const unsigned char *source,
124490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int source_width,
124590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
124690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
124790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
124890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
124990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    /*
125090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned i;
125190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int a, b, c, d, e;
125290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *des = dest;
125390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    const unsigned char *src = source;
125490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
125590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void) dest_width;
125690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
125790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    for ( i=0; i<source_width; i+=5 )
125890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
125990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        a = src[0];
126090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        b = src[1];
126190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        c = src[2];
126290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        d = src[3];
126390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        e = src[4];
126490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
126590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        des[0] = a;
126690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        des[1] = ((b*192 + c* 64 + 128)>>8);
126790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        des[2] = ((c*128 + d*128 + 128)>>8);
126890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        des[3] = ((d* 64 + e*192 + 128)>>8);
126990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
127090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        src += 5;
127190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        des += 4;
127290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
127390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    */
127490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void) dest_width;
127590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
127690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
127790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
127890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
127990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,        source              ;
128090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,        dest                ;
128190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
128290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,        source_width         ;
128390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        const54_1           ;
128490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
128590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,        mm7                 ;
128690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        const54_2           ;
128790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
128890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        round_values         ;
128990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         edx,        [esi+ecx]           ;
129090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        horizontal_line_5_4_loop:
129190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
129290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        QWORD PTR  [esi]    ;
129390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        00 01 02 03 04 05 06 07
129490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                 ;
129590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        00 01 02 03 04 05 06 07
129690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
129790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlq       mm0,        8                   ;
129890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        01 02 03 04 05 06 07 xx
129990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,        mm7                 ;
130090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xx 00 xx 01 xx 02 xx 03
130190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm7                 ;
130390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xx 01 xx 02 xx 03 xx 04
130490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,        mm5
130590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,        mm6
130790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,        5
130890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
130990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,        4
131090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,        mm0
131190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,        mm4
131390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,        8
131490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        cmp         esi,        edx
131690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,        mm7
131790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
131890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi-4], mm1
131990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jl          horizontal_line_5_4_loop
132190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
132390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
132590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short one_fourths[]   = {  64,  64,  64, 64  };
132690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short two_fourths[]   = { 128, 128, 128, 128 };
132790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
132890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
132990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
133090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
133190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
133290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
133490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
133590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        push        ebx
133690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
133790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    source                    // Get the source and destination pointer
133890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    src_pitch               // Get the pitch size
133990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,    dest                    // tow lines below
134190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7                     // clear out mm7
134290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,    dest_pitch               // Loop counter
134490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ebx,    dest_width
134590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        vs_5_4_loop:
134790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
134890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,    DWORD ptr [esi]         // src[0];
134990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
135090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm2,    DWORD ptr [esi+ecx*2]
135290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         eax,    [esi+ecx*2]             //
135390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7
135590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,    mm7
135690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
135790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm3,    mm2
135890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    three_fourths
135990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    one_fourths
136190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm4,    [eax+ecx]
136290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    two_fourths
136490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,    mm7
136590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    mm4
136790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    two_fourths
136890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
136990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    mm2
137090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm6,    [eax+ecx*2]
137190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
137290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm5,    one_fourths
137390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    round_values;
137490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
137590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    mm4
137690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,    8
137790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
137890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm6,    mm7
137990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    round_values
138090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
138190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm6,    three_fourths
138290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm3,    8
138390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
138490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,    mm7
138590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm3,    mm7
138690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
138790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi], mm0
138890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+edx], mm1
138990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
139090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
139190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5,    mm6
139290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+edx*2], mm3
139390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
139490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         eax,    [edi+edx*2]
139590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm5,    round_values
139690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
139790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm5,    8
139890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    4
139990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm5,    mm7
140190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [eax+edx], mm5
140290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    4
140490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         ebx,    4
140590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg         vs_5_4_loop
140790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
140890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pop         ebx
140990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
141090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
141190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
141290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
141390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short const53_1[] = {  0,  85, 171, 0 };
141490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short const53_2[] = {256, 171,  85, 0 };
141590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
141690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
/*
 * horizontal_line_5_3_scale_mmx
 *
 * Scales one line of pixels horizontally, turning every group of 5 source
 * bytes into 3 destination bytes (MMX, MSVC inline assembly).
 *
 *   source       : first source byte of the line
 *   source_width : number of source bytes; only used to compute the loop end
 *                  pointer (source + source_width - 5)
 *   dest         : first destination byte
 *   dest_width   : unused
 *
 * For a group of source bytes s0..s4 the three outputs are
 *   d0 = (256*s0          + r) >> 8
 *   d1 = ( 85*s1 + 171*s2 + r) >> 8
 *   d2 = (171*s3 +  85*s4 + r) >> 8
 * where r is the per-lane value of round_values (defined outside this block;
 * presumably the rounding bias for the >> 8 -- confirm).
 *
 * NOTE(review): every group is fetched with an 8-byte movq, so the final group
 * reads 3 bytes beyond the 5 it needs.
 * NOTE(review): the loop body stores 4 bytes per group (the 4th is overwritten
 * by the next group's store); only the final group is stored as 2 + 1 bytes.
 * NOTE(review): the loop body runs once before its end test and the final
 * group is processed unconditionally, so at least two groups (10 source bytes,
 * 6 destination bytes) are always handled.
 * NOTE(review): the end test uses jl, a signed comparison of two pointers.
 */
141790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
141890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid horizontal_line_5_3_scale_mmx
141990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
142090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    const unsigned char *source,
142190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int source_width,
142290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
142390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
142490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
142590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
142690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
142790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void) dest_width;
142890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
142990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
143090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // esi = read pointer, edi = write pointer
143190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,        source              ;
143290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,        dest                ;
143390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // mm5 = weights for the odd-offset bytes, mm6 = weights for the
        // even-offset bytes, mm7 = 0 (used when packing), mm4 = rounding term
143490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,        source_width         ;
143590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        const53_1           ;
143690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
143790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,        mm7                 ;
143890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        const53_2           ;
143990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
144090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        round_values         ;
        // edx = source + source_width - 5, i.e. the start of the last group
144190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         edx,        [esi+ecx-5]         ;
144290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        horizontal_line_5_3_loop:
144390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Load 8 source bytes and split them into even-offset bytes (mm0)
        // and odd-offset bytes (mm1), one byte per 16-bit lane.
144490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        QWORD PTR  [esi]    ;
144590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        00 01 02 03 04 05 06 07
144690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                 ;
144790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        00 01 02 03 04 05 06 07
144890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
144990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0,        8                   ;
145090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xx 00 xx 02 xx 04 xx 06
145190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,        8                   ;
145290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        01 xx 03 xx 05 xx 07 xx
145390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // The 16-bit shift moves the odd bytes up one lane so that s1 lines
        // up with s2 and s3 lines up with s4 before the weighted add.
145490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,        8                   ;
145590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        00 xx 02 xx 04 xx 06 xx
145690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllq       mm1,        16                  ;
145790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xx xx 01 xx 03 xx 05 xx
145890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Apply the weights; the pointer updates are interleaved with the
        // MMX arithmetic.
145990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,        mm6
146090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,        mm5
146290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,        5
146390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
146490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,        3
146590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,        mm0
146690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Add the rounding term and scale back down by 256.
146790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,        mm4
146890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,        8
146990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // The flags set by cmp are consumed by the jl below; packuswb and
        // movd in between do not modify them.
147090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        cmp         esi,        edx
147190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,        mm7
147290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // edi was already advanced by 3, hence the -3.  This stores 4 bytes.
147390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi-3], mm1
147490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jl          horizontal_line_5_3_loop
147590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
147690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber//exit condition
        // Final group: same arithmetic as the loop body, but the result is
        // stored as exactly 3 bytes (see the end of this block).
147790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,        QWORD PTR  [esi]    ;
147890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        00 01 02 03 04 05 06 07
147990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm1,        mm0                 ;
148090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        00 01 02 03 04 05 06 07
148190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
148290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0,        8                   ;
148390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xx 00 xx 02 xx 04 xx 06
148490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,        8                   ;
148590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        01 xx 03 xx 05 xx 07 xx
148690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
148790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,        8                   ;
148890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        00 xx 02 xx 04 xx 06 xx
148990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllq       mm1,        16                  ;
149090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xx xx 01 xx 03 xx 05 xx
149190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,        mm6
149390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,        mm5
149590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,        mm0
149690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
149790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,        mm4
149890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,        8
149990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
150090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,        mm7
150190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        eax,        mm1
150290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // eax holds the packed result; dl receives its third byte.
150390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,        eax
150490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        shr         edx,        16
150590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Store 2 bytes + 1 byte so nothing is written past the 3rd output.
150690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         WORD PTR[edi],   ax
150790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         BYTE PTR[edi+2], dl
150890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
150990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
151090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
151190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
151290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
/*
 * Four-lane 16-bit weights used by vertical_band_5_3_scale_mmx, which loads
 * one_thirds into mm5 and two_thirds into mm6.  The weighted sums are shifted
 * right by 8, so 85 is roughly 1/3 and 171 roughly 2/3 (85 + 171 = 256).
 */
151390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short one_thirds[] = {  85,  85,  85,  85 };
151490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
151590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
/*
 * vertical_band_5_3_scale_mmx
 *
 * Produces 3 destination rows from 5 source rows, 4 columns per loop
 * iteration (MMX, MSVC inline assembly).
 *
 *   source     : first byte of source row 0; rows 1..4 are read at
 *                source + k * src_pitch
 *   src_pitch  : byte distance between source rows
 *   dest       : first byte of destination row 0; rows 1 and 2 are written at
 *                dest + dest_pitch and dest + 2 * dest_pitch
 *   dest_pitch : byte distance between destination rows
 *   dest_width : number of columns to process
 *
 * Per column:
 *   dest row 0 = source row 0 (copied unchanged)
 *   dest row 1 = ( 85 * row1 + 171 * row2 + r) >> 8
 *   dest row 2 = (171 * row3 +  85 * row4 + r) >> 8
 * where r is the per-lane value of round_values (defined outside this block;
 * presumably the rounding bias for the >> 8 -- confirm).
 *
 * NOTE(review): columns are read and written 4 bytes at a time and the counter
 * is only tested after the body, so one iteration always runs and the last
 * iteration may touch up to 3 bytes beyond dest_width when it is not a
 * multiple of 4.
 */
151690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
151790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
151890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
151990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
152090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
152190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
        // ebx is used as the column counter; it is saved here and restored
        // by the pop at the end of the block.
152290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        push        ebx
152390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
152490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    source                    // esi = source pointer (row 0)
152590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    src_pitch               // ecx = source pitch
152690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
152790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,    dest                    // edi = destination pointer (row 0)
152890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7                     // clear out mm7
152990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,    dest_pitch               // edx = destination pitch (row offset for the stores below)
153190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,    one_thirds
153290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,    two_thirds
        // ebx = remaining column count (the actual loop counter)
153490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ebx,    dest_width;
153590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        vs_5_3_loop:
153790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
153890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,    DWORD ptr [esi]         // src[0];
153990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm1,    DWORD ptr [esi+ecx]     // src[1];
154090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // mm2 = source row 2
154190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm2,    DWORD ptr [esi+ecx*2]
154290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         eax,    [esi+ecx*2]             // eax = address of source row 2, base for rows 3 and 4
154390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Widen rows 1 and 2 to 16-bit lanes and weight them 1/3 : 2/3.
154490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,    mm7
154590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,    mm7
154690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
154790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,    mm5
154890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,    mm6
154990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // mm3 = source row 3, mm4 = source row 4, weighted 2/3 : 1/3.
155090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm3,    DWORD ptr [eax+ecx]
155190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm4,    DWORD ptr [eax+ecx*2]
155290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm3,    mm7
155490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm4,    mm7
155590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm3,    mm6
155790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm4,    mm5
155890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
155990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Destination row 0 is source row 0, stored unmodified.
156090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi], mm0
156190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    mm2
156290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,    round_values
156490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm1,    8
156590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
156690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm1,    mm7
156790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    mm4
156890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Destination row 1 is stored while row 2 is still being finished.
156990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm3,    round_values
157090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+edx], mm1
157190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm3,    8
157390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm3,    mm7
157490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Destination row 2.
157590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi+edx*2], mm3
157690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
157790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Advance 4 columns and loop while the remaining count is positive.
157890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,    4
157990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,    4
158090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         ebx,    4
158290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jg          vs_5_3_loop
158390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pop         ebx
158590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
158690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
158790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
158990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
159090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
159190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber/****************************************************************************
159290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
159390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  ROUTINE       : horizontal_line_2_1_scale
159490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
159590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  INPUTS        : const unsigned char *source :
159690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int source_width    :
159790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned char *dest         :
159890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *                  unsigned int dest_width      :
159990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
160090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  OUTPUTS       : None.
160190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
160290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  RETURNS       : void
160390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
160490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  FUNCTION      : 2 to 1 down-scaling of a horizontal line of pixels.
160590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
160690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *  SPECIAL NOTES : None.
160790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber *
160890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber ****************************************************************************/
/*
 * Implementation notes for horizontal_line_2_1_scale_mmx:
 * each iteration loads 8 source bytes from source + 2 * edx, keeps the low
 * byte of every 16-bit lane (the bytes at even offsets) and stores the 4
 * surviving bytes at dest + edx, i.e. every second source pixel is copied.
 *
 *   source       : first source byte of the line
 *   source_width : unused
 *   dest         : first destination byte
 *   dest_width   : number of destination bytes; read by the assembly as the
 *                  loop bound even though it is also cast to void below
 *
 * NOTE(review): the bound is tested after the body with a signed jl, so one
 * iteration always runs, and output is written in 4-byte units (up to 3 bytes
 * beyond dest_width when it is not a multiple of 4).
 */
160990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
161090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid horizontal_line_2_1_scale_mmx
161190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber(
161290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    const unsigned char *source,
161390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int source_width,
161490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned char *dest,
161590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    unsigned int dest_width
161690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber)
161790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
161890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void) dest_width;
161990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void) source_width;
162090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
162190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
        // esi = source, edi = dest, ecx = dest_width, mm7 = 0
162290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,    source
162390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,    dest
162490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
162590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,    mm7
162690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         ecx,    dest_width
162790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // edx = destination byte offset, starting at 0
162890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        xor         edx,    edx
162990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        hs_2_1_loop:
163090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Shifting each 16-bit lane left then right by 8 clears its high
        // byte, leaving only the even-offset source bytes.
163190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm0,    [esi+edx*2]
163290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psllw       mm0,    8
163390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,    8
        // Pack the four 16-bit lanes into the low 4 bytes of mm0.
163590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,    mm7
163690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
163790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD Ptr [edi+edx], mm0;
163890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edx,    4
163990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Continue while the output offset is below dest_width.
164090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        cmp         edx,    ecx
164190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jl          hs_2_1_loop
164290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
164490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
164590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
164790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
/*
 * vertical_band_2_1_scale_mmx
 *
 * Hands dest, source and dest_width to vpx_memcpy (defined elsewhere;
 * presumably a memcpy-style copy of dest_width bytes -- confirm).  No
 * filtering is done and no other row is touched, so both pitch arguments
 * are ignored.
 *
 *   source     : source row
 *   src_pitch  : unused
 *   dest       : destination row
 *   dest_pitch : unused
 *   dest_width : size passed to vpx_memcpy
 */
static void
vertical_band_2_1_scale_mmx(unsigned char *source,
                            unsigned int src_pitch,
                            unsigned char *dest,
                            unsigned int dest_pitch,
                            unsigned int dest_width)
{
    /* Silence unused-parameter warnings. */
    (void) src_pitch;
    (void) dest_pitch;

    vpx_memcpy(dest, source, dest_width);
}
165590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
165690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
/*
 * Four-lane 16-bit weights used by vertical_band_2_1_scale_i_mmx, which loads
 * three_sixteenths into mm5 and ten_sixteenths into mm4.  The weighted sum is
 * shifted right by 8, so 48 is 3/16 and 160 is 10/16 (48 + 160 + 48 = 256).
 */
165790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short three_sixteenths[] = {  48,  48,  48,  48 };
165890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber__declspec(align(16)) const static unsigned short ten_sixteenths[]   = { 160, 160, 160, 160 };
165990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
/*
 * vertical_band_2_1_scale_i_mmx
 *
 * Produces one destination row as a 3-tap vertical filter over the source row
 * and the rows directly above and below it, 4 columns per loop iteration
 * (MMX, MSVC inline assembly):
 *
 *   dest = (48 * above + 160 * current + 48 * below + r) >> 8
 *
 * where above = source - src_pitch, current = source,
 * below = source + src_pitch, and r is the per-lane value of round_values
 * (defined outside this block; presumably the rounding bias -- confirm).
 *
 *   source     : first byte of the current source row
 *   src_pitch  : byte distance between source rows
 *   dest       : first byte of the destination row
 *   dest_pitch : unused
 *   dest_width : number of columns to process
 *
 * NOTE(review): the row at source - src_pitch is read, so the caller must make
 * sure a row exists above source.
 * NOTE(review): mm6 is loaded with round_values but never read afterwards; the
 * paddw in the loop uses the memory operand instead.
 * NOTE(review): the bound is tested after the body with jl (a signed compare
 * of two pointers), and columns are handled in 4-byte units.
 */
166090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberstatic
166190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width)
166290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
166390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
166490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    (void) dest_pitch;
166590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    __asm
166690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    {
        // esi = source, edi = dest, eax = src_pitch, edx = dest_width
166790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         esi,        source
166890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edi,        dest
166990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         eax,        src_pitch
167190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        mov         edx,        dest_width
167290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pxor        mm7,        mm7
167490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        sub         esi,        eax             //back one line
167590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
167690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // ecx = end pointer for esi: (source - src_pitch) + dest_width
167790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        lea         ecx,        [esi+edx];
167890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm6,        round_values;
167990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // mm5 = outer-tap weight (48), mm4 = centre-tap weight (160)
168090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm5,        three_sixteenths;
168190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movq        mm4,        ten_sixteenths;
168290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        vs_2_1_i_loop:
168490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm0,        [esi]           // row above (source - src_pitch)
168590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm1,        [esi+eax]       // current row (source)
168690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
168790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        mm2,        [esi+eax*2]     // row below (source + src_pitch)
168890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm0,        mm7
168990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Widen each row to 16-bit lanes and multiply by its tap weight.
169090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm0,        mm5
169190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm1,        mm7
169290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm1,        mm4
169490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        punpcklbw   mm2,        mm7
169590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
169690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        pmullw      mm2,        mm5
        // Rounding term comes from memory here, not from mm6.
169790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,        round_values
169890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Sum the three weighted rows, scale down by 256 and pack to bytes.
169990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm1,        mm2
170090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        paddw       mm0,        mm1
170190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
170290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        psrlw       mm0,        8
170390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        packuswb    mm0,        mm7
170490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
170590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        movd        DWORD PTR [edi],        mm0
170690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         esi,        4
170790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
        // Advance 4 columns; loop while esi is below the end pointer.
170890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        add         edi,        4;
170990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        cmp         esi,        ecx
171090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber        jl          vs_2_1_i_loop
171190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    }
171390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
171490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
171790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervoid
171890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberregister_mmxscalers(void)
171990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber{
172090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_horizontal_line_1_2_scale        = horizontal_line_1_2_scale_mmx;
172190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_1_2_scale          = vertical_band_1_2_scale_mmx;
172290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_last_vertical_band_1_2_scale      = last_vertical_band_1_2_scale_mmx;
172390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_horizontal_line_3_5_scale        = horizontal_line_3_5_scale_mmx;
172490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_3_5_scale          = vertical_band_3_5_scale_mmx;
172590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_last_vertical_band_3_5_scale      = last_vertical_band_3_5_scale_mmx;
172690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_horizontal_line_4_5_scale        = horizontal_line_4_5_scale_mmx;
172790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_4_5_scale          = vertical_band_4_5_scale_mmx;
172890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_last_vertical_band_4_5_scale      = last_vertical_band_4_5_scale_mmx;
172990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
173090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_horizontal_line_3_4_scale        = vp8cx_horizontal_line_3_4_scale_c;
173190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_3_4_scale          = vp8cx_vertical_band_3_4_scale_c;
173290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_last_vertical_band_3_4_scale      = vp8cx_last_vertical_band_3_4_scale_c;
173390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_horizontal_line_2_3_scale        = vp8cx_horizontal_line_2_3_scale_c;
173490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_2_3_scale          = vp8cx_vertical_band_2_3_scale_c;
173590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_last_vertical_band_2_3_scale      = vp8cx_last_vertical_band_2_3_scale_c;
173690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
173790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
173890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
173990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_5_4_scale           = vertical_band_5_4_scale_mmx;
174090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_5_3_scale           = vertical_band_5_3_scale_mmx;
174190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_2_1_scale           = vertical_band_2_1_scale_mmx;
174290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_vertical_band_2_1_scale_i         = vertical_band_2_1_scale_i_mmx;
174390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_horizontal_line_2_1_scale         = horizontal_line_2_1_scale_mmx;
174490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_horizontal_line_5_3_scale         = horizontal_line_5_3_scale_mmx;
174590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vp8_horizontal_line_5_4_scale         = horizontal_line_5_4_scale_mmx;
174690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
174790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
174890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
174990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
175090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber}
1751