/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
11e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
12e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell#include <stdlib.h>
1322144ab7552f0799bcfca506bf4ffa7f70a06649Gareth Hughes#include "vp8_rtcd.h"
14e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell#include "vpx_ports/mem.h"
15e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
1622144ab7552f0799bcfca506bf4ffa7f70a06649Gareth Hughes#if HAVE_DSPR2
/* Number of guard entries placed on each side of the 256-entry identity
 * range of ff_cropTbl. */
#define CROP_WIDTH 256

/* Clamp lookup table, filled in by dsputil_static_init():
 *   [0, CROP_WIDTH)                     -> 0
 *   [CROP_WIDTH, CROP_WIDTH + 256)      -> 0..255 (identity)
 *   [CROP_WIDTH + 256, 2*CROP_WIDTH+256) -> 255
 * The filters index it through a pointer offset by CROP_WIDTH so that
 * slightly out-of-range results are saturated by a single byte load. */
unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
19e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
/* Eight rows (one per sub-pixel offset 0..7) of three 16-bit words, each word
 * holding two byte-sized unsigned values.  For rows 1..7 the bytes satisfy
 * (bytes of column 0) - (bytes of column 1) + (bytes of column 2) == 128.
 * Row 0 is all zero.
 * NOTE(review): not referenced in the visible part of this file; the bytes
 * look like filter-tap magnitudes - confirm against the code that uses it. */
static const unsigned short sub_pel_filterss[8][3] =
{
    {      0,      0,      0},
    {      0, 0x0601, 0x7b0c},
    { 0x0201, 0x0b08, 0x6c24},
    {      0, 0x0906, 0x5d32},
    { 0x0303, 0x1010, 0x4d4d},
    {      0, 0x0609, 0x325d},
    { 0x0102, 0x080b, 0x246c},
    {      0, 0x0106, 0x0c7b},
};
31cd03ed4f54444d96e4e47cdb118a3dfd94d92bb0Keith Whitwell
32e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
/* Eight rows (one per sub-pixel offset 0..7) of three 32-bit words.  Each
 * word packs two signed 16-bit coefficients (high halfword, low halfword);
 * e.g. 0x0002fff5 is the pair (2, -11).  The six halfwords of every non-zero
 * row sum to 128.  Row 0 is all zero.
 * NOTE(review): not referenced in the visible part of this file. */
static const int sub_pel_filters_int[8][3] =
{
    {          0,          0,          0},
    { 0x0000fffa, 0x007b000c, 0xffff0000},
    { 0x0002fff5, 0x006c0024, 0xfff80001},
    { 0x0000fff7, 0x005d0032, 0xfffa0000},
    { 0x0003fff0, 0x004d004d, 0xfff00003},
    { 0x0000fffa, 0x0032005d, 0xfff70000},
    { 0x0001fff8, 0x0024006c, 0xfff50002},
    { 0x0000ffff, 0x000c007b, 0xfffa0000},
};
44b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul
45b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul
/* Coefficient table indexed as [xoffset][0..2] by the first-pass filters.
 * Each 32-bit word packs two signed 16-bit coefficients that are fed to
 * dpa.w.ph as one operand; e.g. 0xfff50002 is the pair (-11, 2).  The six
 * halfwords of every non-zero row sum to 128.
 *
 * Column 2 doubles as a selector in the filter code: 0 means "no filtering"
 * (row 0), a value above 65536 (non-zero high halfword; rows 2, 4, 6) selects
 * the 6-tap path, anything else (rows 1, 3, 5, 7) selects the 4-tap path. */
static const int sub_pel_filters_inv[8][3] =
{
    {          0,          0,          0},
    { 0xfffa0000, 0x000c007b, 0x0000ffff},
    { 0xfff50002, 0x0024006c, 0x0001fff8},
    { 0xfff70000, 0x0032005d, 0x0000fffa},
    { 0xfff00003, 0x004d004d, 0x0003fff0},
    { 0xfffa0000, 0x005d0032, 0x0000fff7},
    { 0xfff80001, 0x006c0024, 0x0002fff5},
    { 0xffff0000, 0x007b000c, 0x0000fffa},
};
57b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul
58e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
/* Eight rows of two 32-bit words; each word packs two signed 16-bit
 * coefficients (e.g. 0xfffa007b is the pair (-6, 123)).  Only the odd rows
 * are populated, and their four halfwords sum to 128; even rows are zero.
 * NOTE(review): not referenced in the visible part of this file. */
static const int sub_pel_filters_int_tap_4[8][2] =
{
    {          0,          0},
    { 0xfffa007b, 0x000cffff},
    {          0,          0},
    { 0xfff7005d, 0x0032fffa},
    {          0,          0},
    { 0xfffa0032, 0x005dfff7},
    {          0,          0},
    { 0xffff000c, 0x007bfffa},
};
7054ef88109b3e135f7cc1feabbbc7dbf640a5d8ccKeith Whitwell
71e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
/* 4-tap coefficient table indexed as [xoffset][0..1] by the first-pass
 * filter's 4-tap path.  Each 32-bit word packs two signed 16-bit
 * coefficients used as one dpa.w.ph operand (e.g. 0x007bfffa is the pair
 * (123, -6)).  Only the odd rows are populated, and their four halfwords sum
 * to 128; even rows are zero because even offsets take the 6-tap path. */
static const int sub_pel_filters_inv_tap_4[8][2] =
{
    {          0,          0},
    { 0x007bfffa, 0xffff000c},
    {          0,          0},
    { 0x005dfff7, 0xfffa0032},
    {          0,          0},
    { 0x0032fffa, 0xfff7005d},
    {          0,          0},
    { 0x000cffff, 0xfffa007b},
};
83b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul
/* Issue a MIPS `pref` instruction with hint 0 on the address in src.
 * The instruction has no outputs; it only hints the cache that the data at
 * src is about to be read.
 *
 * Declared `static inline`: a plain `inline` definition provides no external
 * definition under C99/C11, so a call the compiler chooses not to inline
 * would be left unresolved at link time. */
static inline void prefetch_load(unsigned char *src)
{
    __asm__ __volatile__ (
        "pref   0,  0(%[src])   \n\t"
        :
        : [src] "r" (src)
    );
}
92b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul
9345bc887da226403f2c41077e40ca38b6f60f1359Brian Paul
/* Issue a MIPS `pref` instruction with hint 1 on the address in dst.
 * The instruction has no outputs; it only hints the cache that the data at
 * dst is about to be written.
 *
 * Declared `static inline`: a plain `inline` definition provides no external
 * definition under C99/C11, so a call the compiler chooses not to inline
 * would be left unresolved at link time. */
static inline void prefetch_store(unsigned char *dst)
{
    __asm__ __volatile__ (
        "pref   1,  0(%[dst])   \n\t"
        :
        : [dst] "r" (dst)
    );
}
102b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul
10377df88727cb0a423dd5cb41498c2302d9df4fce7Brian Paulvoid dsputil_static_init(void)
104b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul{
105b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul    int i;
106b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul
10777df88727cb0a423dd5cb41498c2302d9df4fce7Brian Paul    for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;
108b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul
109a803b0c891404dcd7c376e91f6a033cd4e42abc3Brian Paul    for (i = 0; i < CROP_WIDTH; i++)
11045bc887da226403f2c41077e40ca38b6f60f1359Brian Paul    {
111b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul        ff_cropTbl[i] = 0;
11245bc887da226403f2c41077e40ca38b6f60f1359Brian Paul        ff_cropTbl[i + CROP_WIDTH + 256] = 255;
113b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul    }
114b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul}
115e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
116e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwellvoid vp8_filter_block2d_first_pass_4
117e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell(
118e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell    unsigned char *RESTRICT src_ptr,
119b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul    unsigned char *RESTRICT dst_ptr,
120b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul    unsigned int src_pixels_per_line,
121b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul    unsigned int output_height,
122b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul    int xoffset,
12322a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    int pitch
12422a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul)
12522a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul{
12645bc887da226403f2c41077e40ca38b6f60f1359Brian Paul    unsigned int i;
127e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell    int Temp1, Temp2, Temp3, Temp4;
128e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
12922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    unsigned int vector4a = 64;
13022a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    int vector1b, vector2b, vector3b;
13122a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    unsigned int tp1, tp2, tn1, tn2;
13245bc887da226403f2c41077e40ca38b6f60f1359Brian Paul    unsigned int p1, p2, p3;
133e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell    unsigned int n1, n2, n3;
134e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
135e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
13622a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    vector3b = sub_pel_filters_inv[xoffset][2];
13722a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul
13822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    /* if (xoffset == 0) we don't need any filtering */
13922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    if (vector3b == 0)
14022a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    {
14122a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul        for (i = 0; i < output_height; i++)
14222a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul        {
14322a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            /* prefetch src_ptr data to cache memory */
144dc24230de7f913969b52dee3579bb8fa3d50a8c0Karl Schultz            prefetch_load(src_ptr + src_pixels_per_line);
14522a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            dst_ptr[0] = src_ptr[0];
14654ef88109b3e135f7cc1feabbbc7dbf640a5d8ccKeith Whitwell            dst_ptr[1] = src_ptr[1];
147dc24230de7f913969b52dee3579bb8fa3d50a8c0Karl Schultz            dst_ptr[2] = src_ptr[2];
14822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            dst_ptr[3] = src_ptr[3];
14922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul
15045bc887da226403f2c41077e40ca38b6f60f1359Brian Paul            /* next row... */
151b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul            src_ptr += src_pixels_per_line;
152e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell            dst_ptr += 4;
153e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell        }
154e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell    }
15522a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    else
15622a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul    {
15722a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul        if (vector3b > 65536)
15822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul        {
15922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            /* 6 tap filter */
16022a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul
16122a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            vector1b = sub_pel_filters_inv[xoffset][0];
16222a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            vector2b = sub_pel_filters_inv[xoffset][1];
16322a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul
16422a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            /* prefetch src_ptr data to cache memory */
16554ef88109b3e135f7cc1feabbbc7dbf640a5d8ccKeith Whitwell            prefetch_load(src_ptr + src_pixels_per_line);
166dc24230de7f913969b52dee3579bb8fa3d50a8c0Karl Schultz
16722a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            for (i = output_height; i--;)
16822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul            {
16945bc887da226403f2c41077e40ca38b6f60f1359Brian Paul                /* apply filter with vectors pairs */
170e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                __asm__ __volatile__ (
171e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
172b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul                    "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
173e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
1749ede048127ea71282fd97e01516dedcfb03e2a23Brian                    /* even 1. pixel */
17522a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "mtlo             %[vector4a], $ac3                           \n\t"
17622a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
17722a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
17822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
17922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
1809ede048127ea71282fd97e01516dedcfb03e2a23Brian                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
18122a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
18222a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul
18322a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    /* even 2. pixel */
18422a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "mtlo             %[vector4a], $ac2                           \n\t"
18522a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
18654ef88109b3e135f7cc1feabbbc7dbf640a5d8ccKeith Whitwell                    "balign           %[tp2],      %[tp1],         3              \n\t"
187dc24230de7f913969b52dee3579bb8fa3d50a8c0Karl Schultz                    "extp             %[Temp1],    $ac3,           9              \n\t"
18822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
18922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
190a803b0c891404dcd7c376e91f6a033cd4e42abc3Brian Paul                    "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
191e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
192e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    /* odd 1. pixel */
193b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul                    "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
194e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "mtlo             %[vector4a], $ac3                           \n\t"
195e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
19622a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
19722a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
19822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "extp             %[Temp3],    $ac2,           9              \n\t"
19922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
20046b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
20146b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
20246b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell
20346b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    /* even 2. pixel */
20446b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "mtlo             %[vector4a], $ac2                           \n\t"
20546b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
20646b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "extp             %[Temp2],    $ac3,           9              \n\t"
20746b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
20846b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
20946b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
21046b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    "extp             %[Temp4],    $ac2,           9              \n\t"
21146b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell
21246b0988c673b28e072fd0cbf477632a9ab6f9f18Keith Whitwell                    /* clamp */
213e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
214e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
215ba41b8afb1b40b967cf5c0e604bbf09793eb8feeBrian Paul                    "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
216ba41b8afb1b40b967cf5c0e604bbf09793eb8feeBrian Paul                    "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
217e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
218e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    /* store bytes */
219e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
220cd03ed4f54444d96e4e47cdb118a3dfd94d92bb0Keith Whitwell                    "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
221cd03ed4f54444d96e4e47cdb118a3dfd94d92bb0Keith Whitwell                    "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
22268a7ee5cbbc93c0e746dc843e0fcc1e65ad1baf6Brian Paul                    "sb               %[n2],       3(%[dst_ptr])                  \n\t"
22322a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul
22422a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
22522a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
22622a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                      [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
22722a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                      [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
22822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
22922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
23022a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
23122a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                      [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
23222a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                );
233e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
234b5b8d22c4ee921dff99b898a5907023b20670a27Brian Paul                /* Next row... */
235e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                src_ptr += src_pixels_per_line;
236e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                dst_ptr += pitch;
237e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell            }
238e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell        }
239e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell        else
240233aafbb30594d0193b00705d0532be97060ebd1Brian Paul        {
241233aafbb30594d0193b00705d0532be97060ebd1Brian Paul            /* 4 tap filter */
242233aafbb30594d0193b00705d0532be97060ebd1Brian Paul
243233aafbb30594d0193b00705d0532be97060ebd1Brian Paul            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
244233aafbb30594d0193b00705d0532be97060ebd1Brian Paul            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
245233aafbb30594d0193b00705d0532be97060ebd1Brian Paul
246233aafbb30594d0193b00705d0532be97060ebd1Brian Paul            for (i = output_height; i--;)
247233aafbb30594d0193b00705d0532be97060ebd1Brian Paul            {
24868a7ee5cbbc93c0e746dc843e0fcc1e65ad1baf6Brian Paul                /* apply filter with vectors pairs */
249233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                __asm__ __volatile__ (
250233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                    "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
251233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                    "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
252233aafbb30594d0193b00705d0532be97060ebd1Brian Paul
253233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                    /* even 1. pixel */
254c123a9b2edc5852cb50485f344219508254081a8Brian Paul                    "mtlo             %[vector4a], $ac3                           \n\t"
255233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
256233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
257233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
258233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
259233aafbb30594d0193b00705d0532be97060ebd1Brian Paul                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
260e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
261e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    /* even 2. pixel */
262e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "mtlo             %[vector4a], $ac2                           \n\t"
263e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
264e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
265e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "extp             %[Temp1],    $ac3,           9              \n\t"
266e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell
267e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    /* odd 1. pixel */
268cd03ed4f54444d96e4e47cdb118a3dfd94d92bb0Keith Whitwell                    "srl              %[tn1],      %[tp2],         8              \n\t"
269e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "balign           %[tp2],      %[tp1],         3              \n\t"
270cd03ed4f54444d96e4e47cdb118a3dfd94d92bb0Keith Whitwell                    "mtlo             %[vector4a], $ac3                           \n\t"
271b6bcae5698df88f7730d40004ce7ce0462e97a20Brian Paul                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
272e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
273b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul                    "preceu.ph.qbr    %[n3],       %[tn1]                         \n\t"
274e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "extp             %[Temp3],    $ac2,           9              \n\t"
275e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
2760070d398d13759adc519f9bc764ffd39bc88890eBrian Paul                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
27764b4298181373d64ef2226935f70e9062536de8bBrian Paul
278e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    /* odd 2. pixel */
279ccea3ff8a9eae16d2ca11b9cedef1318cffe3fb4Brian                    "mtlo             %[vector4a], $ac2                           \n\t"
280ccea3ff8a9eae16d2ca11b9cedef1318cffe3fb4Brian                    "extp             %[Temp2],    $ac3,           9              \n\t"
28122a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
2829ede048127ea71282fd97e01516dedcfb03e2a23Brian                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
283e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "extp             %[Temp4],    $ac2,           9              \n\t"
284dcf4c17fb1624af47181c63af4c3ad29f919c17aBrian Paul
28522a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    /* clamp and store results */
28622a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
28722a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
28822a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
28922a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
29022a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
29122a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
292e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                    "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
29322a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    "sb               %[n2],       3(%[dst_ptr])                  \n\t"
29422a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul
29522a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
29622a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
29722a47c5251ee7b91dc8f7f4f7dbeb3ad5a117b70Brian Paul                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
298e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
299e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
300b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
30145bc887da226403f2c41077e40ca38b6f60f1359Brian Paul                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
302e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                      [src_ptr] "r" (src_ptr)
303e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                );
304b37a084357dd08573b86d6d8c5ba43d65bdc1bd7Brian Paul                /*  Next row... */
30545bc887da226403f2c41077e40ca38b6f60f1359Brian Paul                src_ptr += src_pixels_per_line;
306e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell                dst_ptr += pitch;
307e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell            }
308e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell        }
309e3a051e0538a605551f4d58294c94f5eb00ed07fKeith Whitwell    }
310}
311
312void vp8_filter_block2d_first_pass_8_all
313(
314    unsigned char *RESTRICT src_ptr,
315    unsigned char *RESTRICT dst_ptr,
316    unsigned int src_pixels_per_line,
317    unsigned int output_height,
318    int xoffset,
319    int pitch
320)
321{
322    unsigned int i;
323    int Temp1, Temp2, Temp3, Temp4;
324
325    unsigned int vector4a = 64;
326    unsigned int vector1b, vector2b, vector3b;
327    unsigned int tp1, tp2, tn1, tn2;
328    unsigned int p1, p2, p3, p4;
329    unsigned int n1, n2, n3, n4;
330
331    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
332
333    /* if (xoffset == 0) we don't need any filtering */
334    if (xoffset == 0)
335    {
336        for (i = 0; i < output_height; i++)
337        {
338            /* prefetch src_ptr data to cache memory */
339            prefetch_load(src_ptr + src_pixels_per_line);
340
341            dst_ptr[0] = src_ptr[0];
342            dst_ptr[1] = src_ptr[1];
343            dst_ptr[2] = src_ptr[2];
344            dst_ptr[3] = src_ptr[3];
345            dst_ptr[4] = src_ptr[4];
346            dst_ptr[5] = src_ptr[5];
347            dst_ptr[6] = src_ptr[6];
348            dst_ptr[7] = src_ptr[7];
349
350            /* next row... */
351            src_ptr += src_pixels_per_line;
352            dst_ptr += 8;
353        }
354    }
355    else
356    {
357        vector3b = sub_pel_filters_inv[xoffset][2];
358
359        if (vector3b > 65536)
360        {
361            /* 6 tap filter */
362
363            vector1b = sub_pel_filters_inv[xoffset][0];
364            vector2b = sub_pel_filters_inv[xoffset][1];
365
366            for (i = output_height; i--;)
367            {
368                /* prefetch src_ptr data to cache memory */
369                prefetch_load(src_ptr + src_pixels_per_line);
370
371                /* apply filter with vectors pairs */
372                __asm__ __volatile__ (
373                    "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
374                    "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
375
376                    /* even 1. pixel */
377                    "mtlo             %[vector4a], $ac3                           \n\t"
378                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
379                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
380                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
381                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
382                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
383                    "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
384
385                    /* even 2. pixel */
386                    "mtlo             %[vector4a], $ac2                           \n\t"
387                    "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
388                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
389                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
390                    "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
391
392                    "balign           %[tp2],      %[tp1],         3              \n\t"
393                    "extp             %[Temp1],    $ac3,           9              \n\t"
394                    "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
395
396                    /* odd 1. pixel */
397                    "mtlo             %[vector4a], $ac3                           \n\t"
398                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
399                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
400                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
401                    "extp             %[Temp3],    $ac2,           9              \n\t"
402                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
403                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
404                    "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
405
406                    /* odd 2. pixel */
407                    "mtlo             %[vector4a], $ac2                           \n\t"
408                    "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
409                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
410                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
411                    "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
412                    "ulw              %[tp1],      6(%[src_ptr])                  \n\t"
413                    "extp             %[Temp2],    $ac3,           9              \n\t"
414                    "mtlo             %[vector4a], $ac3                           \n\t"
415                    "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
416                    "extp             %[Temp4],    $ac2,           9              \n\t"
417
418                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
419                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
420                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
421                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
422                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
423                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
424                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
425                      [src_ptr] "r" (src_ptr)
426                );
427
428                /* clamp and store results */
429                dst_ptr[0] = cm[Temp1];
430                dst_ptr[1] = cm[Temp2];
431                dst_ptr[2] = cm[Temp3];
432                dst_ptr[3] = cm[Temp4];
433
434                /* next 4 pixels */
435                __asm__ __volatile__ (
436                    /* even 3. pixel */
437                    "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
438                    "dpa.w.ph         $ac3,        %[p1],          %[vector2b]    \n\t"
439                    "dpa.w.ph         $ac3,        %[p2],          %[vector3b]    \n\t"
440
441                    /* even 4. pixel */
442                    "mtlo             %[vector4a], $ac2                           \n\t"
443                    "preceu.ph.qbl    %[p4],       %[tp1]                         \n\t"
444                    "dpa.w.ph         $ac2,        %[p1],          %[vector1b]    \n\t"
445                    "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
446                    "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
447
448                    "ulw              %[tn1],      7(%[src_ptr])                  \n\t"
449                    "extp             %[Temp1],    $ac3,           9              \n\t"
450
451                    /* odd 3. pixel */
452                    "mtlo             %[vector4a], $ac3                           \n\t"
453                    "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
454                    "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
455                    "dpa.w.ph         $ac3,        %[n1],          %[vector2b]    \n\t"
456                    "dpa.w.ph         $ac3,        %[n2],          %[vector3b]    \n\t"
457                    "extp             %[Temp3],    $ac2,           9              \n\t"
458
459                    /* odd 4. pixel */
460                    "mtlo             %[vector4a], $ac2                           \n\t"
461                    "preceu.ph.qbl    %[n4],       %[tn1]                         \n\t"
462                    "dpa.w.ph         $ac2,        %[n1],          %[vector1b]    \n\t"
463                    "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
464                    "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
465                    "extp             %[Temp2],    $ac3,           9              \n\t"
466                    "extp             %[Temp4],    $ac2,           9              \n\t"
467
468                    : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
469                      [p4] "=&r" (p4), [n4] "=&r" (n4),
470                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
471                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
472                    : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
473                      [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
474                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
475                      [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
476                );
477
478                /* clamp and store results */
479                dst_ptr[4] = cm[Temp1];
480                dst_ptr[5] = cm[Temp2];
481                dst_ptr[6] = cm[Temp3];
482                dst_ptr[7] = cm[Temp4];
483
484                src_ptr += src_pixels_per_line;
485                dst_ptr += pitch;
486            }
487        }
488        else
489        {
490            /* 4 tap filter */
491
492            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
493            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
494
495            for (i = output_height; i--;)
496            {
497                /* prefetch src_ptr data to cache memory */
498                prefetch_load(src_ptr + src_pixels_per_line);
499
500                /* apply filter with vectors pairs */
501                __asm__ __volatile__ (
502                    "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
503
504                    /* even 1. pixel */
505                    "mtlo             %[vector4a], $ac3                           \n\t"
506                    "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
507                    "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
508                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
509                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
510
511                    "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
512
513                    /* even 2. pixel  */
514                    "mtlo             %[vector4a], $ac2                           \n\t"
515                    "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
516                    "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
517                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
518                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
519                    "extp             %[Temp1],    $ac3,           9              \n\t"
520
521                    "balign           %[tp2],      %[tp1],         3              \n\t"
522
523                    /* odd 1. pixel */
524                    "mtlo             %[vector4a], $ac3                           \n\t"
525                    "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
526                    "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
527                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
528                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
529                    "extp             %[Temp3],    $ac2,           9              \n\t"
530
531                    "ulw              %[tn2],      4(%[src_ptr])                  \n\t"
532
533                    /* odd 2. pixel */
534                    "mtlo             %[vector4a], $ac2                           \n\t"
535                    "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
536                    "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
537                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
538                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
539                    "ulw              %[tp1],      7(%[src_ptr])                  \n\t"
540                    "extp             %[Temp2],    $ac3,           9              \n\t"
541                    "mtlo             %[vector4a], $ac3                           \n\t"
542                    "extp             %[Temp4],    $ac2,           9              \n\t"
543
544                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
545                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
546                      [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
547                      [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
548                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
549                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
550                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
551                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
552                );
553
554                /* clamp and store results */
555                dst_ptr[0] = cm[Temp1];
556                dst_ptr[1] = cm[Temp2];
557                dst_ptr[2] = cm[Temp3];
558                dst_ptr[3] = cm[Temp4];
559
560                /* next 4 pixels */
561                __asm__ __volatile__ (
562                    /* even 3. pixel */
563                    "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
564                    "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
565
566                    /* even 4. pixel */
567                    "mtlo             %[vector4a], $ac2                           \n\t"
568                    "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
569                    "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
570                    "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
571                    "extp             %[Temp1],    $ac3,           9              \n\t"
572
573                    /* odd 3. pixel */
574                    "mtlo             %[vector4a], $ac3                           \n\t"
575                    "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
576                    "dpa.w.ph         $ac3,        %[n4],          %[vector2b]    \n\t"
577                    "ulw              %[tn1],      8(%[src_ptr])                  \n\t"
578                    "extp             %[Temp3],    $ac2,           9              \n\t"
579
580                    /* odd 4. pixel */
581                    "mtlo             %[vector4a], $ac2                           \n\t"
582                    "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
583                    "dpa.w.ph         $ac2,        %[n4],          %[vector1b]    \n\t"
584                    "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
585                    "extp             %[Temp2],    $ac3,           9              \n\t"
586                    "extp             %[Temp4],    $ac2,           9              \n\t"
587
588                    : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
589                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
590                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
591                    : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
592                      [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
593                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
594                      [n3] "r" (n3), [n4] "r" (n4)
595                );
596
597                /* clamp and store results */
598                dst_ptr[4] = cm[Temp1];
599                dst_ptr[5] = cm[Temp2];
600                dst_ptr[6] = cm[Temp3];
601                dst_ptr[7] = cm[Temp4];
602
603                /* next row... */
604                src_ptr += src_pixels_per_line;
605                dst_ptr += pitch;
606            }
607        }
608    }
609}
610
611
612void vp8_filter_block2d_first_pass16_6tap
613(
614    unsigned char *RESTRICT src_ptr,
615    unsigned char *RESTRICT dst_ptr,
616    unsigned int src_pixels_per_line,
617    unsigned int output_height,
618    int xoffset,
619    int pitch
620)
621{
622    unsigned int i;
623    int Temp1, Temp2, Temp3, Temp4;
624
625    unsigned int vector4a;
626    unsigned int vector1b, vector2b, vector3b;
627    unsigned int tp1, tp2, tn1, tn2;
628    unsigned int p1, p2, p3, p4;
629    unsigned int n1, n2, n3, n4;
630    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
631
632    vector1b = sub_pel_filters_inv[xoffset][0];
633    vector2b = sub_pel_filters_inv[xoffset][1];
634    vector3b = sub_pel_filters_inv[xoffset][2];
635    vector4a = 64;
636
637    for (i = output_height; i--;)
638    {
639        /* prefetch src_ptr data to cache memory */
640        prefetch_load(src_ptr + src_pixels_per_line);
641
642        /* apply filter with vectors pairs */
643        __asm__ __volatile__ (
644            "ulw                %[tp1],      -2(%[src_ptr])                 \n\t"
645            "ulw                %[tp2],      2(%[src_ptr])                  \n\t"
646
647            /* even 1. pixel */
648            "mtlo               %[vector4a], $ac3                           \n\t"
649            "preceu.ph.qbr      %[p1],       %[tp1]                         \n\t"
650            "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
651            "preceu.ph.qbr      %[p3],       %[tp2]                         \n\t"
652            "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
653            "dpa.w.ph           $ac3,        %[p2],           %[vector2b]   \n\t"
654            "dpa.w.ph           $ac3,        %[p3],           %[vector3b]   \n\t"
655
656            /* even 2. pixel */
657            "mtlo               %[vector4a], $ac2                           \n\t"
658            "preceu.ph.qbl      %[p1],       %[tp2]                         \n\t"
659            "dpa.w.ph           $ac2,        %[p2],           %[vector1b]   \n\t"
660            "dpa.w.ph           $ac2,        %[p3],           %[vector2b]   \n\t"
661            "dpa.w.ph           $ac2,        %[p1],           %[vector3b]   \n\t"
662
663            "balign             %[tp2],      %[tp1],          3             \n\t"
664            "ulw                %[tn2],      3(%[src_ptr])                  \n\t"
665            "extp               %[Temp1],    $ac3,            9             \n\t"
666
667            /* odd 1. pixel */
668            "mtlo               %[vector4a], $ac3                           \n\t"
669            "preceu.ph.qbr      %[n1],       %[tp2]                         \n\t"
670            "preceu.ph.qbl      %[n2],       %[tp2]                         \n\t"
671            "preceu.ph.qbr      %[n3],       %[tn2]                         \n\t"
672            "extp               %[Temp3],    $ac2,            9             \n\t"
673            "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
674            "dpa.w.ph           $ac3,        %[n2],           %[vector2b]   \n\t"
675            "dpa.w.ph           $ac3,        %[n3],           %[vector3b]   \n\t"
676
677            /* odd 2. pixel */
678            "mtlo               %[vector4a], $ac2                           \n\t"
679            "preceu.ph.qbl      %[n1],       %[tn2]                         \n\t"
680            "dpa.w.ph           $ac2,        %[n2],           %[vector1b]   \n\t"
681            "dpa.w.ph           $ac2,        %[n3],           %[vector2b]   \n\t"
682            "dpa.w.ph           $ac2,        %[n1],           %[vector3b]   \n\t"
683            "ulw                %[tp1],      6(%[src_ptr])                  \n\t"
684            "extp               %[Temp2],    $ac3,            9             \n\t"
685            "mtlo               %[vector4a], $ac3                           \n\t"
686            "preceu.ph.qbr      %[p2],       %[tp1]                         \n\t"
687            "extp               %[Temp4],    $ac2,            9             \n\t"
688
689            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
690              [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
691              [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
692              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
693              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
694            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
695              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
696              [src_ptr] "r" (src_ptr)
697        );
698
699        /* clamp and store results */
700        dst_ptr[0] = cm[Temp1];
701        dst_ptr[1] = cm[Temp2];
702        dst_ptr[2] = cm[Temp3];
703        dst_ptr[3] = cm[Temp4];
704
705        /* next 4 pixels */
706        __asm__ __volatile__ (
707            /* even 3. pixel */
708            "dpa.w.ph           $ac3,        %[p3],           %[vector1b]   \n\t"
709            "dpa.w.ph           $ac3,        %[p1],           %[vector2b]   \n\t"
710            "dpa.w.ph           $ac3,        %[p2],           %[vector3b]   \n\t"
711
712            /* even 4. pixel */
713            "mtlo               %[vector4a], $ac2                           \n\t"
714            "preceu.ph.qbl      %[p4],       %[tp1]                         \n\t"
715            "dpa.w.ph           $ac2,        %[p1],           %[vector1b]   \n\t"
716            "dpa.w.ph           $ac2,        %[p2],           %[vector2b]   \n\t"
717            "dpa.w.ph           $ac2,        %[p4],           %[vector3b]   \n\t"
718            "ulw                %[tn1],      7(%[src_ptr])                  \n\t"
719            "extp               %[Temp1],    $ac3,            9             \n\t"
720
721            /* odd 3. pixel */
722            "mtlo               %[vector4a], $ac3                           \n\t"
723            "preceu.ph.qbr      %[n2],       %[tn1]                         \n\t"
724            "dpa.w.ph           $ac3,        %[n3],           %[vector1b]   \n\t"
725            "dpa.w.ph           $ac3,        %[n1],           %[vector2b]   \n\t"
726            "dpa.w.ph           $ac3,        %[n2],           %[vector3b]   \n\t"
727            "extp               %[Temp3],    $ac2,            9             \n\t"
728
729            /* odd 4. pixel */
730            "mtlo               %[vector4a], $ac2                           \n\t"
731            "preceu.ph.qbl      %[n4],       %[tn1]                         \n\t"
732            "dpa.w.ph           $ac2,        %[n1],           %[vector1b]   \n\t"
733            "dpa.w.ph           $ac2,        %[n2],           %[vector2b]   \n\t"
734            "dpa.w.ph           $ac2,        %[n4],           %[vector3b]   \n\t"
735            "ulw                %[tp2],      10(%[src_ptr])                 \n\t"
736            "extp               %[Temp2],    $ac3,            9             \n\t"
737            "mtlo               %[vector4a], $ac3                           \n\t"
738            "preceu.ph.qbr      %[p1],       %[tp2]                         \n\t"
739            "extp               %[Temp4],    $ac2,            9             \n\t"
740
741            : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
742              [p4] "=&r" (p4), [n4] "=&r" (n4),
743              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
744              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
745            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
746              [tp1] "r" (tp1), [n1] "r" (n1), [p1] "r" (p1),
747              [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
748              [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
749        );
750
751        /* clamp and store results */
752        dst_ptr[4] = cm[Temp1];
753        dst_ptr[5] = cm[Temp2];
754        dst_ptr[6] = cm[Temp3];
755        dst_ptr[7] = cm[Temp4];
756
757        /* next 4 pixels */
758        __asm__ __volatile__ (
759            /* even 5. pixel */
760            "dpa.w.ph           $ac3,        %[p2],           %[vector1b]   \n\t"
761            "dpa.w.ph           $ac3,        %[p4],           %[vector2b]   \n\t"
762            "dpa.w.ph           $ac3,        %[p1],           %[vector3b]   \n\t"
763
764            /* even 6. pixel */
765            "mtlo               %[vector4a], $ac2                           \n\t"
766            "preceu.ph.qbl      %[p3],       %[tp2]                         \n\t"
767            "dpa.w.ph           $ac2,        %[p4],           %[vector1b]   \n\t"
768            "dpa.w.ph           $ac2,        %[p1],           %[vector2b]   \n\t"
769            "dpa.w.ph           $ac2,        %[p3],           %[vector3b]   \n\t"
770
771            "ulw                %[tn1],      11(%[src_ptr])                 \n\t"
772            "extp               %[Temp1],    $ac3,            9             \n\t"
773
774            /* odd 5. pixel */
775            "mtlo               %[vector4a], $ac3                           \n\t"
776            "preceu.ph.qbr      %[n1],       %[tn1]                         \n\t"
777            "dpa.w.ph           $ac3,        %[n2],           %[vector1b]   \n\t"
778            "dpa.w.ph           $ac3,        %[n4],           %[vector2b]   \n\t"
779            "dpa.w.ph           $ac3,        %[n1],           %[vector3b]   \n\t"
780            "extp               %[Temp3],    $ac2,            9             \n\t"
781
782            /* odd 6. pixel */
783            "mtlo               %[vector4a], $ac2                           \n\t"
784            "preceu.ph.qbl      %[n3],       %[tn1]                         \n\t"
785            "dpa.w.ph           $ac2,        %[n4],           %[vector1b]   \n\t"
786            "dpa.w.ph           $ac2,        %[n1],           %[vector2b]   \n\t"
787            "dpa.w.ph           $ac2,        %[n3],           %[vector3b]   \n\t"
788            "ulw                %[tp1],      14(%[src_ptr])                 \n\t"
789            "extp               %[Temp2],    $ac3,            9             \n\t"
790            "mtlo               %[vector4a], $ac3                           \n\t"
791            "preceu.ph.qbr      %[p4],       %[tp1]                         \n\t"
792            "extp               %[Temp4],    $ac2,            9             \n\t"
793
794            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
795              [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
796              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
797              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
798            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
799              [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
800              [p4] "r" (p4), [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
801              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
802        );
803
804        /* clamp and store results */
805        dst_ptr[8] = cm[Temp1];
806        dst_ptr[9] = cm[Temp2];
807        dst_ptr[10] = cm[Temp3];
808        dst_ptr[11] = cm[Temp4];
809
810        /* next 4 pixels */
811        __asm__ __volatile__ (
812            /* even 7. pixel */
813            "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
814            "dpa.w.ph           $ac3,        %[p3],           %[vector2b]   \n\t"
815            "dpa.w.ph           $ac3,        %[p4],           %[vector3b]   \n\t"
816
817            /* even 8. pixel */
818            "mtlo               %[vector4a], $ac2                           \n\t"
819            "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
820            "dpa.w.ph           $ac2,        %[p3],           %[vector1b]   \n\t"
821            "dpa.w.ph           $ac2,        %[p4],           %[vector2b]   \n\t"
822            "dpa.w.ph           $ac2,        %[p2],           %[vector3b]   \n\t"
823            "ulw                %[tn1],      15(%[src_ptr])                 \n\t"
824            "extp               %[Temp1],    $ac3,            9             \n\t"
825
826            /* odd 7. pixel */
827            "mtlo               %[vector4a], $ac3                           \n\t"
828            "preceu.ph.qbr      %[n4],       %[tn1]                         \n\t"
829            "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
830            "dpa.w.ph           $ac3,        %[n3],           %[vector2b]   \n\t"
831            "dpa.w.ph           $ac3,        %[n4],           %[vector3b]   \n\t"
832            "extp               %[Temp3],    $ac2,            9             \n\t"
833
834            /* odd 8. pixel */
835            "mtlo               %[vector4a], $ac2                           \n\t"
836            "preceu.ph.qbl      %[n2],       %[tn1]                         \n\t"
837            "dpa.w.ph           $ac2,        %[n3],           %[vector1b]   \n\t"
838            "dpa.w.ph           $ac2,        %[n4],           %[vector2b]   \n\t"
839            "dpa.w.ph           $ac2,        %[n2],           %[vector3b]   \n\t"
840            "extp               %[Temp2],    $ac3,            9             \n\t"
841            "extp               %[Temp4],    $ac2,            9             \n\t"
842
843            /* clamp and store results */
844            "lbux               %[tp1],      %[Temp1](%[cm])                \n\t"
845            "lbux               %[tn1],      %[Temp2](%[cm])                \n\t"
846            "lbux               %[p2],       %[Temp3](%[cm])                \n\t"
847            "sb                 %[tp1],      12(%[dst_ptr])                 \n\t"
848            "sb                 %[tn1],      13(%[dst_ptr])                 \n\t"
849            "lbux               %[n2],       %[Temp4](%[cm])                \n\t"
850            "sb                 %[p2],       14(%[dst_ptr])                 \n\t"
851            "sb                 %[n2],       15(%[dst_ptr])                 \n\t"
852
853            : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
854              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
855              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
856            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
857              [tp1] "r" (tp1), [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
858              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
859              [n3] "r" (n3), [src_ptr] "r" (src_ptr),
860              [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
861        );
862
863        src_ptr += src_pixels_per_line;
864        dst_ptr += pitch;
865    }
866}
867
868
869void vp8_filter_block2d_first_pass16_0
870(
871    unsigned char *RESTRICT src_ptr,
872    unsigned char *RESTRICT output_ptr,
873    unsigned int src_pixels_per_line
874)
875{
876    int Temp1, Temp2, Temp3, Temp4;
877    int i;
878
879    /* prefetch src_ptr data to cache memory */
880    prefetch_store(output_ptr + 32);
881
882    /* copy memory from src buffer to dst buffer */
883    for (i = 0; i < 7; i++)
884    {
885        __asm__ __volatile__ (
886            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
887            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
888            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
889            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
890            "sw     %[Temp1],   0(%[output_ptr])                            \n\t"
891            "sw     %[Temp2],   4(%[output_ptr])                            \n\t"
892            "sw     %[Temp3],   8(%[output_ptr])                            \n\t"
893            "sw     %[Temp4],   12(%[output_ptr])                           \n\t"
894            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
895
896            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
897              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
898            : [src_pixels_per_line] "r" (src_pixels_per_line),
899              [output_ptr] "r" (output_ptr)
900        );
901
902        __asm__ __volatile__ (
903            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
904            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
905            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
906            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
907            "sw     %[Temp1],   16(%[output_ptr])                           \n\t"
908            "sw     %[Temp2],   20(%[output_ptr])                           \n\t"
909            "sw     %[Temp3],   24(%[output_ptr])                           \n\t"
910            "sw     %[Temp4],   28(%[output_ptr])                           \n\t"
911            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
912
913            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
914              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
915            : [src_pixels_per_line] "r" (src_pixels_per_line),
916              [output_ptr] "r" (output_ptr)
917        );
918
919        __asm__ __volatile__ (
920            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
921            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
922            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
923            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
924            "sw     %[Temp1],   32(%[output_ptr])                           \n\t"
925            "sw     %[Temp2],   36(%[output_ptr])                           \n\t"
926            "sw     %[Temp3],   40(%[output_ptr])                           \n\t"
927            "sw     %[Temp4],   44(%[output_ptr])                           \n\t"
928            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"
929
930            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
931              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
932            : [src_pixels_per_line] "r" (src_pixels_per_line),
933              [output_ptr] "r" (output_ptr)
934        );
935
936        output_ptr += 48;
937    }
938}
939
940
941void vp8_filter_block2d_first_pass16_4tap
942(
943    unsigned char *RESTRICT src_ptr,
944    unsigned char *RESTRICT output_ptr,
945    unsigned int src_pixels_per_line,
946    unsigned int output_width,
947    unsigned int output_height,
948    int xoffset,
949    int yoffset,
950    unsigned char *RESTRICT dst_ptr,
951    int pitch
952)
953{
954    unsigned int i, j;
955    int Temp1, Temp2, Temp3, Temp4;
956
957    unsigned int vector4a;
958    int vector1b, vector2b;
959    unsigned int tp1, tp2, tp3, tn1;
960    unsigned int p1, p2, p3;
961    unsigned int n1, n2, n3;
962    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
963
964    vector4a = 64;
965
966    vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
967    vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
968
969    /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
970    if (yoffset == 0)
971    {
972        output_height -= 5;
973        src_ptr += (src_pixels_per_line + src_pixels_per_line);
974
975        for (i = output_height; i--;)
976        {
977            __asm__ __volatile__ (
978                "ulw     %[tp3],   -1(%[src_ptr])               \n\t"
979                : [tp3] "=&r" (tp3)
980                : [src_ptr] "r" (src_ptr)
981            );
982
983            /* processing 4 adjacent pixels */
984            for (j = 0; j < 16; j += 4)
985            {
986                /* apply filter with vectors pairs */
987                __asm__ __volatile__ (
988                    "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
989                    "move             %[tp1],      %[tp3]                           \n\t"
990
991                    /* even 1. pixel */
992                    "mtlo             %[vector4a], $ac3                             \n\t"
993                    "mthi             $0,          $ac3                             \n\t"
994                    "move             %[tp3],      %[tp2]                           \n\t"
995                    "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
996                    "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
997                    "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
998                    "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
999                    "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"
1000
1001                    /* even 2. pixel */
1002                    "mtlo             %[vector4a], $ac2                             \n\t"
1003                    "mthi             $0,          $ac2                             \n\t"
1004                    "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
1005                    "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
1006                    "extr.w           %[Temp1],    $ac3,            7               \n\t"
1007
1008                    /* odd 1. pixel */
1009                    "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
1010                    "balign           %[tp2],      %[tp1],          3               \n\t"
1011                    "mtlo             %[vector4a], $ac3                             \n\t"
1012                    "mthi             $0,          $ac3                             \n\t"
1013                    "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
1014                    "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
1015                    "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
1016                    "extr.w           %[Temp3],    $ac2,            7               \n\t"
1017                    "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
1018                    "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"
1019
1020                    /* odd 2. pixel */
1021                    "mtlo             %[vector4a], $ac2                             \n\t"
1022                    "mthi             $0,          $ac2                             \n\t"
1023                    "extr.w           %[Temp2],    $ac3,            7               \n\t"
1024                    "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
1025                    "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
1026                    "extr.w           %[Temp4],    $ac2,            7               \n\t"
1027
1028                    /* clamp and store results */
1029                    "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
1030                    "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
1031                    "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
1032                    "sb               %[tp1],      0(%[dst_ptr])                    \n\t"
1033                    "sb               %[tn1],      1(%[dst_ptr])                    \n\t"
1034                    "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
1035                    "sb               %[tp2],      2(%[dst_ptr])                    \n\t"
1036                    "sb               %[n2],       3(%[dst_ptr])                    \n\t"
1037
1038                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
1039                      [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2),
1040                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
1041                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3),
1042                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
1043                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1044                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
1045                      [src_ptr] "r" (src_ptr)
1046                );
1047
1048                src_ptr += 4;
1049            }
1050
1051            /* Next row... */
1052            src_ptr += src_pixels_per_line - 16;
1053            dst_ptr += pitch;
1054        }
1055    }
1056    else
1057    {
1058        for (i = output_height; i--;)
1059        {
1060            /* processing 4 adjacent pixels */
1061            for (j = 0; j < 16; j += 4)
1062            {
1063                /* apply filter with vectors pairs */
1064                __asm__ __volatile__ (
1065                    "ulw              %[tp1],      -1(%[src_ptr])                   \n\t"
1066                    "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
1067
1068                    /* even 1. pixel */
1069                    "mtlo             %[vector4a], $ac3                             \n\t"
1070                    "mthi             $0,          $ac3                             \n\t"
1071                    "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
1072                    "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
1073                    "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
1074                    "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
1075                    "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"
1076
1077                    /* even 2. pixel */
1078                    "mtlo             %[vector4a], $ac2                             \n\t"
1079                    "mthi             $0,          $ac2                             \n\t"
1080                    "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
1081                    "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
1082                    "extr.w           %[Temp1],    $ac3,            7               \n\t"
1083
1084                    /* odd 1. pixel */
1085                    "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
1086                    "balign           %[tp2],      %[tp1],          3               \n\t"
1087                    "mtlo             %[vector4a], $ac3                             \n\t"
1088                    "mthi             $0,          $ac3                             \n\t"
1089                    "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
1090                    "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
1091                    "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
1092                    "extr.w           %[Temp3],    $ac2,            7               \n\t"
1093                    "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
1094                    "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"
1095
1096                    /* odd 2. pixel */
1097                    "mtlo             %[vector4a], $ac2                             \n\t"
1098                    "mthi             $0,          $ac2                             \n\t"
1099                    "extr.w           %[Temp2],    $ac3,            7               \n\t"
1100                    "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
1101                    "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
1102                    "extr.w           %[Temp4],    $ac2,            7               \n\t"
1103
1104                    /* clamp and store results */
1105                    "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
1106                    "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
1107                    "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
1108                    "sb               %[tp1],      0(%[output_ptr])                 \n\t"
1109                    "sb               %[tn1],      1(%[output_ptr])                 \n\t"
1110                    "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
1111                    "sb               %[tp2],      2(%[output_ptr])                 \n\t"
1112                    "sb               %[n2],       3(%[output_ptr])                 \n\t"
1113
1114                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
1115                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
1116                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
1117                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
1118                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
1119                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1120                      [vector4a] "r" (vector4a), [cm] "r" (cm),
1121                      [output_ptr] "r" (output_ptr), [src_ptr] "r" (src_ptr)
1122                );
1123
1124                src_ptr += 4;
1125            }
1126
1127            /* next row... */
1128            src_ptr += src_pixels_per_line;
1129            output_ptr += output_width;
1130        }
1131    }
1132}
1133
1134
1135void vp8_filter_block2d_second_pass4
1136(
1137    unsigned char *RESTRICT src_ptr,
1138    unsigned char *RESTRICT output_ptr,
1139    int output_pitch,
1140    int yoffset
1141)
1142{
1143    unsigned int i;
1144
1145    int Temp1, Temp2, Temp3, Temp4;
1146    unsigned int vector1b, vector2b, vector3b, vector4a;
1147
1148    unsigned char src_ptr_l2;
1149    unsigned char src_ptr_l1;
1150    unsigned char src_ptr_0;
1151    unsigned char src_ptr_r1;
1152    unsigned char src_ptr_r2;
1153    unsigned char src_ptr_r3;
1154
1155    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1156
1157    vector4a = 64;
1158
1159    /* load filter coefficients */
1160    vector1b = sub_pel_filterss[yoffset][0];
1161    vector2b = sub_pel_filterss[yoffset][2];
1162    vector3b = sub_pel_filterss[yoffset][1];
1163
1164    if (vector1b)
1165    {
1166        /* 6 tap filter */
1167
1168        for (i = 2; i--;)
1169        {
1170            /* prefetch src_ptr data to cache memory */
1171            prefetch_load(src_ptr);
1172
1173            /* do not allow compiler to reorder instructions */
1174            __asm__ __volatile__ (
1175                ".set noreorder                                                 \n\t"
1176                :
1177                :
1178            );
1179
1180            /* apply filter with vectors pairs */
1181            __asm__ __volatile__ (
1182                "lbu            %[src_ptr_l2],  -8(%[src_ptr])                  \n\t"
1183                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
1184                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1185                "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
1186                "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
1187                "lbu            %[src_ptr_r3],  12(%[src_ptr])                  \n\t"
1188                "mtlo           %[vector4a],    $ac2                            \n\t"
1189
1190                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1191                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1192                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1193                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1194                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1195                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1196
1197                "lbu            %[src_ptr_l2],  -7(%[src_ptr])                  \n\t"
1198                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
1199                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1200                "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
1201                "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
1202                "lbu            %[src_ptr_r3],  13(%[src_ptr])                  \n\t"
1203                "mtlo           %[vector4a],    $ac3                            \n\t"
1204                "extp           %[Temp1],       $ac2,           9               \n\t"
1205
1206                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1207                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1208                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1209                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1210                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1211                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1212
1213                "lbu            %[src_ptr_l2],  -6(%[src_ptr])                  \n\t"
1214                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
1215                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1216                "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
1217                "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
1218                "lbu            %[src_ptr_r3],  14(%[src_ptr])                  \n\t"
1219                "mtlo           %[vector4a],    $ac0                            \n\t"
1220                "extp           %[Temp2],       $ac3,           9               \n\t"
1221
1222                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1223                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1224                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1225                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1226                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1227                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1228
1229                "lbu            %[src_ptr_l2],  -5(%[src_ptr])                  \n\t"
1230                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
1231                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1232                "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
1233                "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
1234                "lbu            %[src_ptr_r3],  15(%[src_ptr])                  \n\t"
1235                "mtlo           %[vector4a],    $ac1                            \n\t"
1236                "extp           %[Temp3],       $ac0,           9               \n\t"
1237
1238                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1239                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1240                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1241                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1242                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1243                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1244                "extp           %[Temp4],       $ac1,           9               \n\t"
1245
1246                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
1247                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
1248                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1249                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
1250                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
1251                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1252                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
1253                  [src_ptr] "r" (src_ptr)
1254            );
1255
1256            /* clamp and store results */
1257            output_ptr[0] = cm[Temp1];
1258            output_ptr[1] = cm[Temp2];
1259            output_ptr[2] = cm[Temp3];
1260            output_ptr[3] = cm[Temp4];
1261
1262            output_ptr += output_pitch;
1263
1264            /* apply filter with vectors pairs */
1265            __asm__ __volatile__ (
1266                "lbu            %[src_ptr_l2],  -4(%[src_ptr])                  \n\t"
1267                "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
1268                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1269                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1270                "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
1271                "lbu            %[src_ptr_r3],  16(%[src_ptr])                  \n\t"
1272                "mtlo           %[vector4a],    $ac2                            \n\t"
1273                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1274                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1275                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1276                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1277                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1278                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1279
1280                "lbu            %[src_ptr_l2],  -3(%[src_ptr])                  \n\t"
1281                "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
1282                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1283                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1284                "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
1285                "lbu            %[src_ptr_r3],  17(%[src_ptr])                  \n\t"
1286                "mtlo           %[vector4a],    $ac3                            \n\t"
1287                "extp           %[Temp1],       $ac2,           9               \n\t"
1288
1289                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1290                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1291                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1292                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1293                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1294                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1295
1296                "lbu            %[src_ptr_l2],  -2(%[src_ptr])                  \n\t"
1297                "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
1298                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1299                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
1300                "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
1301                "lbu            %[src_ptr_r3],  18(%[src_ptr])                  \n\t"
1302                "mtlo           %[vector4a],    $ac0                            \n\t"
1303                "extp           %[Temp2],       $ac3,           9               \n\t"
1304
1305                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1306                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1307                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1308                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1309                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1310                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1311
1312                "lbu            %[src_ptr_l2],  -1(%[src_ptr])                  \n\t"
1313                "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
1314                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
1315                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
1316                "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
1317                "lbu            %[src_ptr_r3],  19(%[src_ptr])                  \n\t"
1318                "mtlo           %[vector4a],    $ac1                            \n\t"
1319                "extp           %[Temp3],       $ac0,           9               \n\t"
1320
1321                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1322                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1323                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1324                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1325                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1326                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1327                "extp           %[Temp4],       $ac1,           9               \n\t"
1328
1329                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
1330                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
1331                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1332                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
1333                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
1334                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1335                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
1336                  [src_ptr] "r" (src_ptr)
1337            );
1338
1339            /* clamp and store results */
1340            output_ptr[0] = cm[Temp1];
1341            output_ptr[1] = cm[Temp2];
1342            output_ptr[2] = cm[Temp3];
1343            output_ptr[3] = cm[Temp4];
1344
1345            src_ptr += 8;
1346            output_ptr += output_pitch;
1347        }
1348    }
1349    else
1350    {
1351        /* 4 tap filter */
1352
1353        /* prefetch src_ptr data to cache memory */
1354        prefetch_load(src_ptr);
1355
1356        for (i = 2; i--;)
1357        {
1358            /* do not allow compiler to reorder instructions */
1359            __asm__ __volatile__ (
1360                ".set noreorder                                                 \n\t"
1361                :
1362                :
1363            );
1364
1365            /* apply filter with vectors pairs */
1366            __asm__ __volatile__ (
1367                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
1368                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1369                "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
1370                "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
1371                "mtlo           %[vector4a],    $ac2                            \n\t"
1372                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1373                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1374                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1375                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1376
1377                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
1378                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1379                "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
1380                "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
1381                "mtlo           %[vector4a],    $ac3                            \n\t"
1382                "extp           %[Temp1],       $ac2,           9               \n\t"
1383
1384                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1385                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1386                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1387                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1388
1389                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
1390                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1391                "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
1392                "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
1393                "mtlo           %[vector4a],    $ac0                            \n\t"
1394                "extp           %[Temp2],       $ac3,           9               \n\t"
1395
1396                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1397                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1398                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1399                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1400
1401                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
1402                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1403                "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
1404                "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
1405                "mtlo           %[vector4a],    $ac1                            \n\t"
1406                "extp           %[Temp3],       $ac0,           9               \n\t"
1407                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1408                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1409                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1410                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1411                "extp           %[Temp4],       $ac1,           9               \n\t"
1412
1413                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
1414                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
1415                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1416                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
1417                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1418                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
1419            );
1420
1421            /* clamp and store results */
1422            output_ptr[0] = cm[Temp1];
1423            output_ptr[1] = cm[Temp2];
1424            output_ptr[2] = cm[Temp3];
1425            output_ptr[3] = cm[Temp4];
1426
1427            output_ptr += output_pitch;
1428
1429            /* apply filter with vectors pairs */
1430            __asm__ __volatile__ (
1431                "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
1432                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1433                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1434                "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
1435                "mtlo           %[vector4a],    $ac2                            \n\t"
1436                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1437                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1438                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1439                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1440
1441                "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
1442                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1443                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1444                "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
1445                "mtlo           %[vector4a],    $ac3                            \n\t"
1446                "extp           %[Temp1],       $ac2,           9               \n\t"
1447
1448                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1449                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1450                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1451                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1452
1453                "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
1454                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1455                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
1456                "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
1457                "mtlo           %[vector4a],    $ac0                            \n\t"
1458                "extp           %[Temp2],       $ac3,           9               \n\t"
1459
1460                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1461                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1462                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1463                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1464
1465                "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
1466                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
1467                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
1468                "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
1469                "mtlo           %[vector4a],    $ac1                            \n\t"
1470                "extp           %[Temp3],       $ac0,           9               \n\t"
1471                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1472                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1473                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1474                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1475                "extp           %[Temp4],       $ac1,           9               \n\t"
1476
1477                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
1478                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
1479                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1480                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
1481                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1482                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
1483            );
1484
1485            /* clamp and store results */
1486            output_ptr[0] = cm[Temp1];
1487            output_ptr[1] = cm[Temp2];
1488            output_ptr[2] = cm[Temp3];
1489            output_ptr[3] = cm[Temp4];
1490
1491            src_ptr += 8;
1492            output_ptr += output_pitch;
1493        }
1494    }
1495}
1496
1497
1498void vp8_filter_block2d_second_pass_8
1499(
1500    unsigned char *RESTRICT src_ptr,
1501    unsigned char *RESTRICT output_ptr,
1502    int output_pitch,
1503    unsigned int output_height,
1504    unsigned int output_width,
1505    unsigned int yoffset
1506)
1507{
1508    unsigned int i;
1509
1510    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
1511    unsigned int vector1b, vector2b, vector3b, vector4a;
1512
1513    unsigned char src_ptr_l2;
1514    unsigned char src_ptr_l1;
1515    unsigned char src_ptr_0;
1516    unsigned char src_ptr_r1;
1517    unsigned char src_ptr_r2;
1518    unsigned char src_ptr_r3;
1519    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1520
1521    vector4a = 64;
1522
1523    vector1b = sub_pel_filterss[yoffset][0];
1524    vector2b = sub_pel_filterss[yoffset][2];
1525    vector3b = sub_pel_filterss[yoffset][1];
1526
1527    if (vector1b)
1528    {
1529        /* 6 tap filter */
1530
1531        /* prefetch src_ptr data to cache memory */
1532        prefetch_load(src_ptr);
1533
1534        for (i = output_height; i--;)
1535        {
1536            /* apply filter with vectors pairs */
1537            __asm__ __volatile__ (
1538                "lbu            %[src_ptr_l2],  -16(%[src_ptr])                 \n\t"
1539                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
1540                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1541                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1542                "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
1543                "lbu            %[src_ptr_r3],  24(%[src_ptr])                  \n\t"
1544                "mtlo           %[vector4a],    $ac2                            \n\t"
1545
1546                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1547                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1548                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1549                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1550                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1551                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1552
1553                "lbu            %[src_ptr_l2],  -15(%[src_ptr])                 \n\t"
1554                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
1555                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1556                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1557                "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
1558                "lbu            %[src_ptr_r3],  25(%[src_ptr])                  \n\t"
1559                "mtlo           %[vector4a],    $ac3                            \n\t"
1560                "extp           %[Temp1],       $ac2,           9               \n\t"
1561
1562                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1563                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1564                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1565                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1566                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1567                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1568
1569                "lbu            %[src_ptr_l2],  -14(%[src_ptr])                 \n\t"
1570                "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
1571                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1572                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
1573                "lbu            %[src_ptr_r2],  18(%[src_ptr])                  \n\t"
1574                "lbu            %[src_ptr_r3],  26(%[src_ptr])                  \n\t"
1575                "mtlo           %[vector4a],    $ac0                            \n\t"
1576                "extp           %[Temp2],       $ac3,           9               \n\t"
1577
1578                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1579                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1580                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1581                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1582                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1583                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1584
1585                "lbu            %[src_ptr_l2],  -13(%[src_ptr])                 \n\t"
1586                "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
1587                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1588                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
1589                "lbu            %[src_ptr_r2],  19(%[src_ptr])                  \n\t"
1590                "lbu            %[src_ptr_r3],  27(%[src_ptr])                  \n\t"
1591                "mtlo           %[vector4a],    $ac1                            \n\t"
1592                "extp           %[Temp3],       $ac0,           9               \n\t"
1593
1594                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1595                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1596                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1597                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1598                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1599                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1600
1601                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
1602                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1603                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
1604                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
1605                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1606                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
1607                  [src_ptr] "r" (src_ptr)
1608            );
1609
1610            /* apply filter with vectors pairs */
1611            __asm__ __volatile__ (
1612                "lbu            %[src_ptr_l2],  -12(%[src_ptr])                 \n\t"
1613                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
1614                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1615                "lbu            %[src_ptr_r1],  12(%[src_ptr])                  \n\t"
1616                "lbu            %[src_ptr_r2],  20(%[src_ptr])                  \n\t"
1617                "lbu            %[src_ptr_r3],  28(%[src_ptr])                  \n\t"
1618                "mtlo           %[vector4a],    $ac2                            \n\t"
1619
1620                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1621                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1622                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1623                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1624                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1625                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1626                "extp           %[Temp4],       $ac1,           9               \n\t"
1627
1628                "lbu            %[src_ptr_l2],  -11(%[src_ptr])                 \n\t"
1629                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
1630                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1631                "lbu            %[src_ptr_r1],  13(%[src_ptr])                  \n\t"
1632                "lbu            %[src_ptr_r2],  21(%[src_ptr])                  \n\t"
1633                "lbu            %[src_ptr_r3],  29(%[src_ptr])                  \n\t"
1634                "mtlo           %[vector4a],    $ac3                            \n\t"
1635                "extp           %[Temp5],       $ac2,           9               \n\t"
1636
1637                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1638                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1639                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1640                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1641                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1642                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1643
1644                "lbu            %[src_ptr_l2],  -10(%[src_ptr])                 \n\t"
1645                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
1646                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1647                "lbu            %[src_ptr_r1],  14(%[src_ptr])                  \n\t"
1648                "lbu            %[src_ptr_r2],  22(%[src_ptr])                  \n\t"
1649                "lbu            %[src_ptr_r3],  30(%[src_ptr])                  \n\t"
1650                "mtlo           %[vector4a],    $ac0                            \n\t"
1651                "extp           %[Temp6],       $ac3,           9               \n\t"
1652
1653                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1654                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1655                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1656                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1657                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1658                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1659
1660                "lbu            %[src_ptr_l2],  -9(%[src_ptr])                  \n\t"
1661                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
1662                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
1663                "lbu            %[src_ptr_r1],  15(%[src_ptr])                  \n\t"
1664                "lbu            %[src_ptr_r2],  23(%[src_ptr])                  \n\t"
1665                "lbu            %[src_ptr_r3],  31(%[src_ptr])                  \n\t"
1666                "mtlo           %[vector4a],    $ac1                            \n\t"
1667                "extp           %[Temp7],       $ac0,           9               \n\t"
1668
1669                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1670                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1671                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1672                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1673                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1674                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1675                "extp           %[Temp8],       $ac1,           9               \n\t"
1676
1677                : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
1678                  [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
1679                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1680                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
1681                  [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
1682                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1683                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
1684                  [src_ptr] "r" (src_ptr)
1685            );
1686
1687            /* clamp and store results */
1688            output_ptr[0] = cm[Temp1];
1689            output_ptr[1] = cm[Temp2];
1690            output_ptr[2] = cm[Temp3];
1691            output_ptr[3] = cm[Temp4];
1692            output_ptr[4] = cm[Temp5];
1693            output_ptr[5] = cm[Temp6];
1694            output_ptr[6] = cm[Temp7];
1695            output_ptr[7] = cm[Temp8];
1696
1697            src_ptr += 8;
1698            output_ptr += output_pitch;
1699        }
1700    }
1701    else
1702    {
1703        /* 4 tap filter */
1704
1705        /* prefetch src_ptr data to cache memory */
1706        prefetch_load(src_ptr);
1707
1708        for (i = output_height; i--;)
1709        {
1710            __asm__ __volatile__ (
1711                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
1712                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1713                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1714                "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
1715                "mtlo           %[vector4a],    $ac2                            \n\t"
1716                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1717                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1718                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1719                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1720
1721                : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1722                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
1723                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1724                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
1725            );
1726
1727            __asm__ __volatile__ (
1728                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
1729                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1730                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1731                "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
1732                "mtlo           %[vector4a],    $ac3                            \n\t"
1733                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1734                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1735                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1736                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1737                "extp           %[Temp1],       $ac2,           9               \n\t"
1738
1739                : [Temp1] "=r" (Temp1),
1740                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1741                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
1742                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1743                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
1744            );
1745
1746            src_ptr_l1 = src_ptr[-6];
1747            src_ptr_0  = src_ptr[2];
1748            src_ptr_r1 = src_ptr[10];
1749            src_ptr_r2 = src_ptr[18];
1750
1751            __asm__ __volatile__ (
1752                "mtlo           %[vector4a],    $ac0                            \n\t"
1753                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1754                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1755                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1756                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1757                "extp           %[Temp2],       $ac3,           9               \n\t"
1758
1759                : [Temp2] "=r" (Temp2)
1760                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1761                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1762                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1763                  [vector4a] "r" (vector4a)
1764            );
1765
1766            src_ptr_l1 = src_ptr[-5];
1767            src_ptr_0  = src_ptr[3];
1768            src_ptr_r1 = src_ptr[11];
1769            src_ptr_r2 = src_ptr[19];
1770
1771            __asm__ __volatile__ (
1772                "mtlo           %[vector4a],    $ac1                            \n\t"
1773                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1774                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1775                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1776                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1777                "extp           %[Temp3],       $ac0,           9               \n\t"
1778
1779                : [Temp3] "=r" (Temp3)
1780                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1781                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1782                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1783                  [vector4a] "r" (vector4a)
1784            );
1785
1786            src_ptr_l1 = src_ptr[-4];
1787            src_ptr_0  = src_ptr[4];
1788            src_ptr_r1 = src_ptr[12];
1789            src_ptr_r2 = src_ptr[20];
1790
1791            __asm__ __volatile__ (
1792                "mtlo           %[vector4a],    $ac2                            \n\t"
1793                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1794                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1795                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1796                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1797                "extp           %[Temp4],       $ac1,           9               \n\t"
1798
1799                : [Temp4] "=r" (Temp4)
1800                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1801                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1802                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1803                  [vector4a] "r" (vector4a)
1804            );
1805
1806            src_ptr_l1 = src_ptr[-3];
1807            src_ptr_0  = src_ptr[5];
1808            src_ptr_r1 = src_ptr[13];
1809            src_ptr_r2 = src_ptr[21];
1810
1811            __asm__ __volatile__ (
1812                "mtlo           %[vector4a],    $ac3                            \n\t"
1813                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1814                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1815                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1816                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1817                "extp           %[Temp5],       $ac2,           9               \n\t"
1818
1819                : [Temp5] "=&r" (Temp5)
1820                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1821                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1822                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1823                  [vector4a] "r" (vector4a)
1824            );
1825
1826            src_ptr_l1 = src_ptr[-2];
1827            src_ptr_0  = src_ptr[6];
1828            src_ptr_r1 = src_ptr[14];
1829            src_ptr_r2 = src_ptr[22];
1830
1831            __asm__ __volatile__ (
1832                "mtlo           %[vector4a],    $ac0                            \n\t"
1833                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1834                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1835                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1836                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1837                "extp           %[Temp6],       $ac3,           9               \n\t"
1838
1839                : [Temp6] "=r" (Temp6)
1840                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1841                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1842                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1843                  [vector4a] "r" (vector4a)
1844            );
1845
1846            src_ptr_l1 = src_ptr[-1];
1847            src_ptr_0  = src_ptr[7];
1848            src_ptr_r1 = src_ptr[15];
1849            src_ptr_r2 = src_ptr[23];
1850
1851            __asm__ __volatile__ (
1852                "mtlo           %[vector4a],    $ac1                            \n\t"
1853                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1854                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1855                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1856                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1857                "extp           %[Temp7],       $ac0,           9               \n\t"
1858                "extp           %[Temp8],       $ac1,           9               \n\t"
1859
1860                : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
1861                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1862                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1863                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1864                  [vector4a] "r" (vector4a)
1865            );
1866
1867            /* clamp and store results */
1868            output_ptr[0] = cm[Temp1];
1869            output_ptr[1] = cm[Temp2];
1870            output_ptr[2] = cm[Temp3];
1871            output_ptr[3] = cm[Temp4];
1872            output_ptr[4] = cm[Temp5];
1873            output_ptr[5] = cm[Temp6];
1874            output_ptr[6] = cm[Temp7];
1875            output_ptr[7] = cm[Temp8];
1876
1877            src_ptr += 8;
1878            output_ptr += output_pitch;
1879        }
1880    }
1881}
1882
1883
1884void vp8_filter_block2d_second_pass161
1885(
1886    unsigned char *RESTRICT src_ptr,
1887    unsigned char *RESTRICT output_ptr,
1888    int output_pitch,
1889    const unsigned short *vp8_filter
1890)
1891{
1892    unsigned int i, j;
1893
1894    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
1895    unsigned int vector4a;
1896    unsigned int vector1b, vector2b, vector3b;
1897
1898    unsigned char src_ptr_l2;
1899    unsigned char src_ptr_l1;
1900    unsigned char src_ptr_0;
1901    unsigned char src_ptr_r1;
1902    unsigned char src_ptr_r2;
1903    unsigned char src_ptr_r3;
1904    unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1905
1906    vector4a = 64;
1907
1908    vector1b = vp8_filter[0];
1909    vector2b = vp8_filter[2];
1910    vector3b = vp8_filter[1];
1911
1912    if (vector1b == 0)
1913    {
1914        /* 4 tap filter */
1915
1916        /* prefetch src_ptr data to cache memory */
1917        prefetch_load(src_ptr + 16);
1918
1919        for (i = 16; i--;)
1920        {
1921            /* unrolling for loop */
1922            for (j = 0; j < 16; j += 8)
1923            {
1924                /* apply filter with vectors pairs */
1925                __asm__ __volatile__ (
1926                    "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
1927                    "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1928                    "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
1929                    "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
1930                    "mtlo           %[vector4a],    $ac2                            \n\t"
1931                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1932                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1933                    "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1934                    "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1935
1936                    "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
1937                    "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1938                    "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
1939                    "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
1940                    "mtlo           %[vector4a],    $ac3                            \n\t"
1941                    "extp           %[Temp1],       $ac2,           9               \n\t"
1942
1943                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1944                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1945                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1946                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1947
1948                    "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
1949                    "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1950                    "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
1951                    "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
1952                    "mtlo           %[vector4a],    $ac1                            \n\t"
1953                    "extp           %[Temp2],       $ac3,           9               \n\t"
1954
1955                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1956                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1957                    "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1958                    "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1959
1960                    "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
1961                    "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1962                    "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
1963                    "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
1964                    "mtlo           %[vector4a],    $ac3                            \n\t"
1965                    "extp           %[Temp3],       $ac1,           9               \n\t"
1966
1967                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1968                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1969                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1970                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1971
1972                    "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
1973                    "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1974                    "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
1975                    "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
1976                    "mtlo           %[vector4a],    $ac2                            \n\t"
1977                    "extp           %[Temp4],       $ac3,           9               \n\t"
1978
1979                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1980                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1981                    "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1982                    "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1983
1984                    "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
1985                    "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1986                    "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
1987                    "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
1988                    "mtlo           %[vector4a],    $ac3                            \n\t"
1989                    "extp           %[Temp5],       $ac2,           9               \n\t"
1990
1991                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1992                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1993                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1994                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1995
1996                    "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
1997                    "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1998                    "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
1999                    "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
2000                    "mtlo           %[vector4a],    $ac1                            \n\t"
2001                    "extp           %[Temp6],       $ac3,           9               \n\t"
2002
2003                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2004                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2005                    "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2006                    "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2007
2008                    "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
2009                    "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
2010                    "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
2011                    "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
2012                    "mtlo           %[vector4a],    $ac3                            \n\t"
2013                    "extp           %[Temp7],       $ac1,           9               \n\t"
2014
2015                    "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2016                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2017                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2018                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2019                    "extp           %[Temp8],       $ac3,           9               \n\t"
2020
2021                    : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
2022                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
2023                      [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
2024                      [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
2025                      [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
2026                      [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
2027                    : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
2028                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
2029                );
2030
2031                /* clamp and store results */
2032                output_ptr[j] = cm[Temp1];
2033                output_ptr[j + 1] = cm[Temp2];
2034                output_ptr[j + 2] = cm[Temp3];
2035                output_ptr[j + 3] = cm[Temp4];
2036                output_ptr[j + 4] = cm[Temp5];
2037                output_ptr[j + 5] = cm[Temp6];
2038                output_ptr[j + 6] = cm[Temp7];
2039                output_ptr[j + 7] = cm[Temp8];
2040
2041                src_ptr += 8;
2042            }
2043
2044            output_ptr += output_pitch;
2045        }
2046    }
2047    else
2048    {
2049        /* 4 tap filter */
2050
2051        /* prefetch src_ptr data to cache memory */
2052        prefetch_load(src_ptr + 16);
2053
2054        /* unroll for loop */
2055        for (i = 16; i--;)
2056        {
2057            /* apply filter with vectors pairs */
2058            __asm__ __volatile__ (
2059                "lbu            %[src_ptr_l2],  -32(%[src_ptr])                 \n\t"
2060                "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
2061                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
2062                "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
2063                "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
2064                "lbu            %[src_ptr_r3],  48(%[src_ptr])                  \n\t"
2065                "mtlo           %[vector4a],    $ac2                            \n\t"
2066
2067                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2068                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2069                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2070                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2071                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2072                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2073
2074                "lbu            %[src_ptr_l2],  -31(%[src_ptr])                 \n\t"
2075                "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
2076                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
2077                "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
2078                "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
2079                "lbu            %[src_ptr_r3],  49(%[src_ptr])                  \n\t"
2080                "mtlo           %[vector4a],    $ac0                            \n\t"
2081                "extp           %[Temp1],       $ac2,           9               \n\t"
2082
2083                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2084                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2085                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2086                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2087                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2088                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2089
2090                "lbu            %[src_ptr_l2],  -30(%[src_ptr])                 \n\t"
2091                "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
2092                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
2093                "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
2094                "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
2095                "lbu            %[src_ptr_r3],  50(%[src_ptr])                  \n\t"
2096                "mtlo           %[vector4a],    $ac1                            \n\t"
2097                "extp           %[Temp2],       $ac0,           9               \n\t"
2098
2099                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2100                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2101                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2102                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2103                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2104                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2105
2106                "lbu            %[src_ptr_l2],  -29(%[src_ptr])                 \n\t"
2107                "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
2108                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
2109                "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
2110                "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
2111                "lbu            %[src_ptr_r3],  51(%[src_ptr])                  \n\t"
2112                "mtlo           %[vector4a],    $ac3                            \n\t"
2113                "extp           %[Temp3],       $ac1,           9               \n\t"
2114
2115                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2116                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2117                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2118                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2119                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2120                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2121
2122                "lbu            %[src_ptr_l2],  -28(%[src_ptr])                 \n\t"
2123                "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
2124                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
2125                "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
2126                "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
2127                "lbu            %[src_ptr_r3],  52(%[src_ptr])                  \n\t"
2128                "mtlo           %[vector4a],    $ac2                            \n\t"
2129                "extp           %[Temp4],       $ac3,           9               \n\t"
2130
2131                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2132                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2133                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2134                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2135                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2136                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2137
2138                "lbu            %[src_ptr_l2],  -27(%[src_ptr])                 \n\t"
2139                "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
2140                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
2141                "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
2142                "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
2143                "lbu            %[src_ptr_r3],  53(%[src_ptr])                  \n\t"
2144                "mtlo           %[vector4a],    $ac0                            \n\t"
2145                "extp           %[Temp5],       $ac2,           9               \n\t"
2146
2147                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2148                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2149                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2150                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2151                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2152                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2153
2154                "lbu            %[src_ptr_l2],  -26(%[src_ptr])                 \n\t"
2155                "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
2156                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
2157                "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
2158                "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
2159                "lbu            %[src_ptr_r3],  54(%[src_ptr])                  \n\t"
2160                "mtlo           %[vector4a],    $ac1                            \n\t"
2161                "extp           %[Temp6],       $ac0,           9               \n\t"
2162
2163                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2164                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2165                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2166                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2167                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2168                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2169
2170                "lbu            %[src_ptr_l2],  -25(%[src_ptr])                 \n\t"
2171                "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
2172                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
2173                "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
2174                "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
2175                "lbu            %[src_ptr_r3],  55(%[src_ptr])                  \n\t"
2176                "mtlo           %[vector4a],    $ac3                            \n\t"
2177                "extp           %[Temp7],       $ac1,           9               \n\t"
2178
2179                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2180                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2181                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2182                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2183                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2184                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2185                "extp           %[Temp8],       $ac3,           9               \n\t"
2186
2187                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
2188                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
2189                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
2190                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
2191                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
2192                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
2193                  [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
2194                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
2195                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
2196                  [src_ptr] "r" (src_ptr)
2197            );
2198
2199            /* clamp and store results */
2200            output_ptr[0] = cm[Temp1];
2201            output_ptr[1] = cm[Temp2];
2202            output_ptr[2] = cm[Temp3];
2203            output_ptr[3] = cm[Temp4];
2204            output_ptr[4] = cm[Temp5];
2205            output_ptr[5] = cm[Temp6];
2206            output_ptr[6] = cm[Temp7];
2207            output_ptr[7] = cm[Temp8];
2208
2209            /* apply filter with vectors pairs */
2210            __asm__ __volatile__ (
2211                "lbu            %[src_ptr_l2],  -24(%[src_ptr])                 \n\t"
2212                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
2213                "lbu            %[src_ptr_0],   8(%[src_ptr])                   \n\t"
2214                "lbu            %[src_ptr_r1],  24(%[src_ptr])                  \n\t"
2215                "lbu            %[src_ptr_r2],  40(%[src_ptr])                  \n\t"
2216                "lbu            %[src_ptr_r3],  56(%[src_ptr])                  \n\t"
2217                "mtlo           %[vector4a],    $ac2                            \n\t"
2218
2219                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2220                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2221                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2222                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2223                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2224                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2225
2226                "lbu            %[src_ptr_l2],  -23(%[src_ptr])                 \n\t"
2227                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
2228                "lbu            %[src_ptr_0],   9(%[src_ptr])                   \n\t"
2229                "lbu            %[src_ptr_r1],  25(%[src_ptr])                  \n\t"
2230                "lbu            %[src_ptr_r2],  41(%[src_ptr])                  \n\t"
2231                "lbu            %[src_ptr_r3],  57(%[src_ptr])                  \n\t"
2232                "mtlo           %[vector4a],    $ac0                            \n\t"
2233                "extp           %[Temp1],       $ac2,           9               \n\t"
2234
2235                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2236                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2237                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2238                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2239                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2240                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2241
2242                "lbu            %[src_ptr_l2],  -22(%[src_ptr])                 \n\t"
2243                "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
2244                "lbu            %[src_ptr_0],   10(%[src_ptr])                  \n\t"
2245                "lbu            %[src_ptr_r1],  26(%[src_ptr])                  \n\t"
2246                "lbu            %[src_ptr_r2],  42(%[src_ptr])                  \n\t"
2247                "lbu            %[src_ptr_r3],  58(%[src_ptr])                  \n\t"
2248                "mtlo           %[vector4a],    $ac1                            \n\t"
2249                "extp           %[Temp2],       $ac0,           9               \n\t"
2250
2251                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2252                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2253                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2254                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2255                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2256                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2257
2258                "lbu            %[src_ptr_l2],  -21(%[src_ptr])                 \n\t"
2259                "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
2260                "lbu            %[src_ptr_0],   11(%[src_ptr])                  \n\t"
2261                "lbu            %[src_ptr_r1],  27(%[src_ptr])                  \n\t"
2262                "lbu            %[src_ptr_r2],  43(%[src_ptr])                  \n\t"
2263                "lbu            %[src_ptr_r3],  59(%[src_ptr])                  \n\t"
2264                "mtlo           %[vector4a],    $ac3                            \n\t"
2265                "extp           %[Temp3],       $ac1,           9               \n\t"
2266
2267                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2268                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2269                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2270                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2271                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2272                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2273
2274                "lbu            %[src_ptr_l2],  -20(%[src_ptr])                 \n\t"
2275                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
2276                "lbu            %[src_ptr_0],   12(%[src_ptr])                  \n\t"
2277                "lbu            %[src_ptr_r1],  28(%[src_ptr])                  \n\t"
2278                "lbu            %[src_ptr_r2],  44(%[src_ptr])                  \n\t"
2279                "lbu            %[src_ptr_r3],  60(%[src_ptr])                  \n\t"
2280                "mtlo           %[vector4a],    $ac2                            \n\t"
2281                "extp           %[Temp4],       $ac3,           9               \n\t"
2282
2283                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2284                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2285                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2286                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2287                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2288                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2289
2290                "lbu            %[src_ptr_l2],  -19(%[src_ptr])                 \n\t"
2291                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
2292                "lbu            %[src_ptr_0],   13(%[src_ptr])                  \n\t"
2293                "lbu            %[src_ptr_r1],  29(%[src_ptr])                  \n\t"
2294                "lbu            %[src_ptr_r2],  45(%[src_ptr])                  \n\t"
2295                "lbu            %[src_ptr_r3],  61(%[src_ptr])                  \n\t"
2296                "mtlo           %[vector4a],    $ac0                            \n\t"
2297                "extp           %[Temp5],       $ac2,           9               \n\t"
2298
2299                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2300                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2301                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2302                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2303                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2304                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2305
2306                "lbu            %[src_ptr_l2],  -18(%[src_ptr])                 \n\t"
2307                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
2308                "lbu            %[src_ptr_0],   14(%[src_ptr])                  \n\t"
2309                "lbu            %[src_ptr_r1],  30(%[src_ptr])                  \n\t"
2310                "lbu            %[src_ptr_r2],  46(%[src_ptr])                  \n\t"
2311                "lbu            %[src_ptr_r3],  62(%[src_ptr])                  \n\t"
2312                "mtlo           %[vector4a],    $ac1                            \n\t"
2313                "extp           %[Temp6],       $ac0,           9               \n\t"
2314
2315                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2316                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2317                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2318                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2319                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2320                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2321
2322                "lbu            %[src_ptr_l2],  -17(%[src_ptr])                 \n\t"
2323                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
2324                "lbu            %[src_ptr_0],   15(%[src_ptr])                  \n\t"
2325                "lbu            %[src_ptr_r1],  31(%[src_ptr])                  \n\t"
2326                "lbu            %[src_ptr_r2],  47(%[src_ptr])                  \n\t"
2327                "lbu            %[src_ptr_r3],  63(%[src_ptr])                  \n\t"
2328                "mtlo           %[vector4a],    $ac3                            \n\t"
2329                "extp           %[Temp7],       $ac1,           9               \n\t"
2330
2331                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2332                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2333                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2334                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2335                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2336                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2337                "extp           %[Temp8],       $ac3,           9               \n\t"
2338
2339                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
2340                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
2341                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
2342                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
2343                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
2344                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
2345                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
2346                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
2347                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
2348                  [src_ptr] "r" (src_ptr)
2349            );
2350
2351            src_ptr += 16;
2352            output_ptr[8] = cm[Temp1];
2353            output_ptr[9] = cm[Temp2];
2354            output_ptr[10] = cm[Temp3];
2355            output_ptr[11] = cm[Temp4];
2356            output_ptr[12] = cm[Temp5];
2357            output_ptr[13] = cm[Temp6];
2358            output_ptr[14] = cm[Temp7];
2359            output_ptr[15] = cm[Temp8];
2360
2361            output_ptr += output_pitch;
2362        }
2363    }
2364}
2365
2366
2367void vp8_sixtap_predict4x4_dspr2
2368(
2369    unsigned char *RESTRICT src_ptr,
2370    int   src_pixels_per_line,
2371    int  xoffset,
2372    int  yoffset,
2373    unsigned char *RESTRICT dst_ptr,
2374    int dst_pitch
2375)
2376{
2377    unsigned char FData[9 * 4]; /* Temp data bufffer used in filtering */
2378    unsigned int pos = 16;
2379
2380    /* bit positon for extract from acc */
2381    __asm__ __volatile__ (
2382        "wrdsp      %[pos],     1           \n\t"
2383        :
2384        : [pos] "r" (pos)
2385    );
2386
2387    if (yoffset)
2388    {
2389        /* First filter 1-D horizontally... */
2390        vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), FData,
2391                                        src_pixels_per_line, 9, xoffset, 4);
2392        /* then filter verticaly... */
2393        vp8_filter_block2d_second_pass4(FData + 8, dst_ptr, dst_pitch, yoffset);
2394    }
2395    else
2396        /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
2397        vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
2398                                        4, xoffset, dst_pitch);
2399}
2400
2401
2402void vp8_sixtap_predict8x8_dspr2
2403(
2404    unsigned char   *RESTRICT src_ptr,
2405    int  src_pixels_per_line,
2406    int  xoffset,
2407    int  yoffset,
2408    unsigned char *RESTRICT dst_ptr,
2409    int  dst_pitch
2410)
2411{
2412
2413    unsigned char FData[13 * 8]; /* Temp data bufffer used in filtering */
2414    unsigned int pos, Temp1, Temp2;
2415
2416    pos = 16;
2417
2418    /* bit positon for extract from acc */
2419    __asm__ __volatile__ (
2420        "wrdsp      %[pos],     1               \n\t"
2421        :
2422        : [pos] "r" (pos)
2423    );
2424
2425    if (yoffset)
2426    {
2427
2428        src_ptr = src_ptr - (2 * src_pixels_per_line);
2429
2430        if (xoffset)
2431            /* filter 1-D horizontally... */
2432            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
2433                                                13, xoffset, 8);
2434
2435        else
2436        {
2437            /* prefetch src_ptr data to cache memory */
2438            prefetch_load(src_ptr + 2 * src_pixels_per_line);
2439
2440            __asm__ __volatile__ (
2441                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2442                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2443                "sw     %[Temp1],   0(%[FData])                             \n\t"
2444                "sw     %[Temp2],   4(%[FData])                             \n\t"
2445                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2446
2447                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2448                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2449                "sw     %[Temp1],   8(%[FData])                             \n\t"
2450                "sw     %[Temp2],   12(%[FData])                            \n\t"
2451                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2452
2453                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2454                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2455                "sw     %[Temp1],   16(%[FData])                            \n\t"
2456                "sw     %[Temp2],   20(%[FData])                            \n\t"
2457                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2458
2459                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2460                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2461                "sw     %[Temp1],   24(%[FData])                            \n\t"
2462                "sw     %[Temp2],   28(%[FData])                            \n\t"
2463                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2464
2465                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2466                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2467                "sw     %[Temp1],   32(%[FData])                            \n\t"
2468                "sw     %[Temp2],   36(%[FData])                            \n\t"
2469                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2470
2471                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2472                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2473                "sw     %[Temp1],   40(%[FData])                            \n\t"
2474                "sw     %[Temp2],   44(%[FData])                            \n\t"
2475                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2476
2477                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2478                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2479                "sw     %[Temp1],   48(%[FData])                            \n\t"
2480                "sw     %[Temp2],   52(%[FData])                            \n\t"
2481                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2482
2483                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2484                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2485                "sw     %[Temp1],   56(%[FData])                            \n\t"
2486                "sw     %[Temp2],   60(%[FData])                            \n\t"
2487                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2488
2489                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2490                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2491                "sw     %[Temp1],   64(%[FData])                            \n\t"
2492                "sw     %[Temp2],   68(%[FData])                            \n\t"
2493                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2494
2495                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2496                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2497                "sw     %[Temp1],   72(%[FData])                            \n\t"
2498                "sw     %[Temp2],   76(%[FData])                            \n\t"
2499                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2500
2501                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2502                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2503                "sw     %[Temp1],   80(%[FData])                            \n\t"
2504                "sw     %[Temp2],   84(%[FData])                            \n\t"
2505                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2506
2507                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2508                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2509                "sw     %[Temp1],   88(%[FData])                            \n\t"
2510                "sw     %[Temp2],   92(%[FData])                            \n\t"
2511                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2512
2513                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2514                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2515                "sw     %[Temp1],   96(%[FData])                            \n\t"
2516                "sw     %[Temp2],   100(%[FData])                           \n\t"
2517
2518                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
2519                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
2520                  [src_pixels_per_line] "r" (src_pixels_per_line)
2521            );
2522        }
2523
2524        /* filter verticaly... */
2525        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
2526    }
2527
2528    /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
2529    else
2530    {
2531        if (xoffset)
2532            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
2533                                                8, xoffset, dst_pitch);
2534
2535        else
2536        {
2537            /* copy from src buffer to dst buffer */
2538            __asm__ __volatile__ (
2539                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2540                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2541                "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
2542                "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
2543                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2544
2545                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2546                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2547                "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
2548                "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
2549                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2550
2551                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2552                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2553                "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
2554                "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
2555                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2556
2557                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2558                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2559                "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
2560                "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
2561                "addu   %[src_ptr], %[src_ptr],   %[src_pixels_per_line]    \n\t"
2562
2563                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2564                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2565                "sw     %[Temp1],   32(%[dst_ptr])                          \n\t"
2566                "sw     %[Temp2],   36(%[dst_ptr])                          \n\t"
2567                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2568
2569                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2570                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2571                "sw     %[Temp1],   40(%[dst_ptr])                          \n\t"
2572                "sw     %[Temp2],   44(%[dst_ptr])                          \n\t"
2573                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2574
2575                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2576                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2577                "sw     %[Temp1],   48(%[dst_ptr])                          \n\t"
2578                "sw     %[Temp2],   52(%[dst_ptr])                          \n\t"
2579                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2580
2581                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2582                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2583                "sw     %[Temp1],   56(%[dst_ptr])                          \n\t"
2584                "sw     %[Temp2],   60(%[dst_ptr])                          \n\t"
2585
2586                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
2587                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
2588                  [src_pixels_per_line] "r" (src_pixels_per_line)
2589            );
2590        }
2591    }
2592}
2593
2594
2595void vp8_sixtap_predict8x4_dspr2
2596(
2597    unsigned char   *RESTRICT src_ptr,
2598    int  src_pixels_per_line,
2599    int  xoffset,
2600    int  yoffset,
2601    unsigned char *RESTRICT dst_ptr,
2602    int  dst_pitch
2603)
2604{
2605    unsigned char FData[9 * 8]; /* Temp data bufffer used in filtering */
2606    unsigned int pos, Temp1, Temp2;
2607
2608    pos = 16;
2609
2610    /* bit positon for extract from acc */
2611    __asm__ __volatile__ (
2612        "wrdsp      %[pos],     1           \n\t"
2613        :
2614        : [pos] "r" (pos)
2615    );
2616
2617    if (yoffset)
2618    {
2619
2620        src_ptr = src_ptr - (2 * src_pixels_per_line);
2621
2622        if (xoffset)
2623            /* filter 1-D horizontally... */
2624            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
2625                                                9, xoffset, 8);
2626
2627        else
2628        {
2629            /* prefetch src_ptr data to cache memory */
2630            prefetch_load(src_ptr + 2 * src_pixels_per_line);
2631
2632            __asm__ __volatile__ (
2633                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2634                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2635                "sw     %[Temp1],   0(%[FData])                             \n\t"
2636                "sw     %[Temp2],   4(%[FData])                             \n\t"
2637                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2638
2639                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2640                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2641                "sw     %[Temp1],   8(%[FData])                             \n\t"
2642                "sw     %[Temp2],   12(%[FData])                            \n\t"
2643                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2644
2645                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2646                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2647                "sw     %[Temp1],   16(%[FData])                            \n\t"
2648                "sw     %[Temp2],   20(%[FData])                            \n\t"
2649                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2650
2651                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2652                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2653                "sw     %[Temp1],   24(%[FData])                            \n\t"
2654                "sw     %[Temp2],   28(%[FData])                            \n\t"
2655                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2656
2657                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2658                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2659                "sw     %[Temp1],   32(%[FData])                            \n\t"
2660                "sw     %[Temp2],   36(%[FData])                            \n\t"
2661                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2662
2663                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2664                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2665                "sw     %[Temp1],   40(%[FData])                            \n\t"
2666                "sw     %[Temp2],   44(%[FData])                            \n\t"
2667                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2668
2669                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2670                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2671                "sw     %[Temp1],   48(%[FData])                            \n\t"
2672                "sw     %[Temp2],   52(%[FData])                            \n\t"
2673                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2674
2675                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2676                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2677                "sw     %[Temp1],   56(%[FData])                            \n\t"
2678                "sw     %[Temp2],   60(%[FData])                            \n\t"
2679                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2680
2681                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2682                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2683                "sw     %[Temp1],   64(%[FData])                            \n\t"
2684                "sw     %[Temp2],   68(%[FData])                            \n\t"
2685
2686                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
2687                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
2688                  [src_pixels_per_line] "r" (src_pixels_per_line)
2689            );
2690        }
2691
2692        /* filter verticaly... */
2693        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
2694    }
2695
2696    /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
2697    else
2698    {
2699        if (xoffset)
2700            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
2701                                                4, xoffset, dst_pitch);
2702
2703        else
2704        {
2705            /* copy from src buffer to dst buffer */
2706            __asm__ __volatile__ (
2707                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2708                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2709                "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
2710                "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
2711                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2712
2713                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2714                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2715                "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
2716                "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
2717                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2718
2719                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2720                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2721                "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
2722                "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
2723                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2724
2725                "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2726                "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2727                "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
2728                "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
2729
2730                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
2731                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
2732                  [src_pixels_per_line] "r" (src_pixels_per_line)
2733            );
2734        }
2735    }
2736}
2737
2738
2739void vp8_sixtap_predict16x16_dspr2
2740(
2741    unsigned char   *RESTRICT src_ptr,
2742    int  src_pixels_per_line,
2743    int  xoffset,
2744    int  yoffset,
2745    unsigned char *RESTRICT dst_ptr,
2746    int  dst_pitch
2747)
2748{
2749    const unsigned short *VFilter;
2750    unsigned char FData[21 * 16]; /* Temp data bufffer used in filtering */
2751    unsigned int pos;
2752
2753    VFilter = sub_pel_filterss[yoffset];
2754
2755    pos = 16;
2756
2757    /* bit positon for extract from acc */
2758    __asm__ __volatile__ (
2759        "wrdsp      %[pos],     1           \n\t"
2760        :
2761        : [pos] "r" (pos)
2762    );
2763
2764    if (yoffset)
2765    {
2766
2767        src_ptr = src_ptr - (2 * src_pixels_per_line);
2768
2769        switch (xoffset)
2770        {
2771            /* filter 1-D horizontally... */
2772        case 2:
2773        case 4:
2774        case 6:
2775            /* 6 tap filter */
2776            vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
2777                                                 21, xoffset, 16);
2778            break;
2779
2780        case 0:
2781            /* only copy buffer */
2782            vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
2783            break;
2784
2785        case 1:
2786        case 3:
2787        case 5:
2788        case 7:
2789            /* 4 tap filter */
2790            vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
2791                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
2792            break;
2793        }
2794
2795        /* filter verticaly... */
2796        vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
2797    }
2798    else
2799    {
2800        /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
2801        switch (xoffset)
2802        {
2803        case 2:
2804        case 4:
2805        case 6:
2806            /* 6 tap filter */
2807            vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
2808                                                 16, xoffset, dst_pitch);
2809            break;
2810
2811        case 1:
2812        case 3:
2813        case 5:
2814        case 7:
2815            /* 4 tap filter */
2816            vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
2817                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
2818            break;
2819        }
2820    }
2821}
2822
2823#endif
2824