/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#if HAVE_DSPR2
217bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
227bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                      uint8_t *dst, int32_t dst_stride,
237bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                      const int16_t *filter_y, int32_t w,
245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                      int32_t h) {
257bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int32_t x, y;
265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src_ptr;
277bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8_t *dst_ptr;
287bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8_t *cm = vpx_ff_cropTbl;
297bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t vector4a = 64;
307bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t load1, load2, load3, load4;
317bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t p1, p2;
327bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t n1, n2;
337bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t scratch1, scratch2;
347bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t store1, store2;
357bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int32_t vector1b, vector2b, vector3b, vector4b;
367bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int32_t Temp1, Temp2;
375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_y)[0];
395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_y)[1];
405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_y)[2];
415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_y)[3];
425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  src -= 3 * src_stride;
445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
47da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst + dst_stride);
485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (x = 0; x < w; x += 4) {
505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src_ptr = src + x;
515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst_ptr = dst + x;
525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
537bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __asm__ __volatile__(
545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac0                            \n\t"
635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac1                            \n\t"
645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac2                            \n\t"
655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac3                            \n\t"
665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac0                            \n\t"
675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                            \n\t"
685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                            \n\t"
695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                            \n\t"
705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
1015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
1035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
1055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
1065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
1075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
1095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
1105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
1115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
1125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
1135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
1145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
1155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
1165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
1185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
1195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac0,           31              \n\t"
1205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
1215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
1225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac1,           31              \n\t"
1235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
1255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
1265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
1275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
1285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
1295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
1305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
1315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
1325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
1335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
1345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
1365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
1375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
1385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
1395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac2,           31              \n\t"
1405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
1425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
1435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
1445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
1455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac3,           31              \n\t"
1465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
1475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
1495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
1505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
1515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
1535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
1545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
1555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
1565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
1585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
1595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1607bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
1617bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
1627bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
1637bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
1647bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
1657bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
1667bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1677bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
1687bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
1697bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
1705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
1715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
1735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
1745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
1755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
1765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
1775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1787bc9febe8749e98a3812a0dc4380ceae75c29450Johannstatic void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
1797bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                       uint8_t *dst, int32_t dst_stride,
1807bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                       const int16_t *filter_y, int32_t h) {
1817bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int32_t x, y;
1825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  const uint8_t *src_ptr;
1837bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8_t *dst_ptr;
1847bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint8_t *cm = vpx_ff_cropTbl;
1857bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t vector4a = 64;
1867bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t load1, load2, load3, load4;
1877bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t p1, p2;
1887bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t n1, n2;
1897bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t scratch1, scratch2;
1907bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t store1, store2;
1917bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int32_t vector1b, vector2b, vector3b, vector4b;
1927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  int32_t Temp1, Temp2;
1935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector1b = ((const int32_t *)filter_y)[0];
1955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector2b = ((const int32_t *)filter_y)[1];
1965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector3b = ((const int32_t *)filter_y)[2];
1975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  vector4b = ((const int32_t *)filter_y)[3];
1985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
1995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  src -= 3 * src_stride;
2005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  for (y = h; y--;) {
2025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* prefetch data to cache memory */
203da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst + dst_stride);
204da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst + dst_stride + 32);
2055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    for (x = 0; x < 64; x += 4) {
2075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      src_ptr = src + x;
2085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      dst_ptr = dst + x;
2095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2107bc9febe8749e98a3812a0dc4380ceae75c29450Johann      __asm__ __volatile__(
2115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
2125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
2145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
2165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
2185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac0                            \n\t"
2205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac1                            \n\t"
2215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac2                            \n\t"
2225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mtlo             %[vector4a],  $ac3                            \n\t"
2235ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac0                            \n\t"
2245ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac1                            \n\t"
2255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac2                            \n\t"
2265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "mthi             $zero,        $ac3                            \n\t"
2275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
2295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
2305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
2315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
2325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
2335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
2345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
2355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
2365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
2385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
2395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
2405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
2415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
2435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
2445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
2455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
2465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
2475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
2485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
2495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
2505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
2525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
2535ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
2545ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
2555ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2565ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
2585ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
2605ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
2625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
2635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
2645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
2665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
2675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
2685ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
2695ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
2705ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
2715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
2725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
2735ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
2755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
2765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac0,           31              \n\t"
2775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
2785ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
2795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac1,           31              \n\t"
2805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2815ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
2825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
2835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
2845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
2855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
2865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
2875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
2885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
2895ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
2905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
2915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2925ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
2935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
2945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
2955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
2965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp1],     $ac2,           31              \n\t"
2975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
2985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
2995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
3005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
3015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
3025ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "extp             %[Temp2],     $ac3,           31              \n\t"
3035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
3045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
3065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
3075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
3085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
3105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
3115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
3125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
3135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
3155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
3165ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3177bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
3187bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
3197bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
3207bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
3217bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
3227bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
3237bc9febe8749e98a3812a0dc4380ceae75c29450Johann          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
3247bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
3257bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
3267bc9febe8749e98a3812a0dc4380ceae75c29450Johann            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
3275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
3285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    /* Next row... */
3305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    src += src_stride;
3315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    dst += dst_stride;
3325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
3335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
3345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
335da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
3365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                                  uint8_t *dst, ptrdiff_t dst_stride,
337df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                  const InterpKernel *filter, int x0_q4,
338df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
339df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                  int w, int h) {
340df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  const int16_t *const filter_y = filter[y0_q4];
341da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(y_step_q4 == 16);
342da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(((const int32_t *)filter_y)[1] != 0x800000);
343da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
344da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  if (((const int32_t *)filter_y)[0] == 0) {
345df37111358d02836cb29bbcb9c6e4c95dff90a16Johann    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,
346df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
3475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  } else {
348da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    uint32_t pos = 38;
349da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
350da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    /* bit positon for extract from acc */
3517bc9febe8749e98a3812a0dc4380ceae75c29450Johann    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
3527bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         :
3537bc9febe8749e98a3812a0dc4380ceae75c29450Johann                         : [pos] "r"(pos));
354da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
355da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    prefetch_store(dst);
356da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian
357da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian    switch (w) {
358da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 4:
359da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 8:
360da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 16:
361da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 32:
3627bc9febe8749e98a3812a0dc4380ceae75c29450Johann        convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
3637bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                  h);
364da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
365da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      case 64:
366da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_store(dst + 32);
3677bc9febe8749e98a3812a0dc4380ceae75c29450Johann        convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
3687bc9febe8749e98a3812a0dc4380ceae75c29450Johann                                   h);
369da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
370da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      default:
371df37111358d02836cb29bbcb9c6e4c95dff90a16Johann        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
372df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
373da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        break;
3745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    }
3755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
3765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
3775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
378da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
3795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                             uint8_t *dst, ptrdiff_t dst_stride,
380df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                             const InterpKernel *filter, int x0_q4,
381df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                             int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
3827bc9febe8749e98a3812a0dc4380ceae75c29450Johann                             int h) {
3835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* Fixed size intermediate buffer places limits on parameters. */
384da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
3855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
3865ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  assert(w <= 64);
3885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  assert(h <= 64);
389da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(x_step_q4 == 16);
390da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  assert(y_step_q4 == 16);
3915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
3927bc9febe8749e98a3812a0dc4380ceae75c29450Johann  if (intermediate_height < h) intermediate_height = h;
3935ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
394df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter,
395df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,
396df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                      intermediate_height);
3975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
398df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4,
399df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                         x_step_q4, y0_q4, y_step_q4, w, h);
4005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
4015ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
402da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanianvoid vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
4035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang                            uint8_t *dst, ptrdiff_t dst_stride,
404df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                            const InterpKernel *filter, int x0_q4,
405df37111358d02836cb29bbcb9c6e4c95dff90a16Johann                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
4067bc9febe8749e98a3812a0dc4380ceae75c29450Johann                            int h) {
4075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  int x, y;
4087bc9febe8749e98a3812a0dc4380ceae75c29450Johann  uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
409df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)filter;
410df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)x0_q4;
411df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)x_step_q4;
412df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)y0_q4;
413df37111358d02836cb29bbcb9c6e4c95dff90a16Johann  (void)y_step_q4;
4145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  /* prefetch data to cache memory */
416da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(src);
417da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_load(src + 32);
418da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian  prefetch_store(dst);
4195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  switch (w) {
4215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 4:
4225ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 1 word storage */
4237bc9febe8749e98a3812a0dc4380ceae75c29450Johann      for (y = h; y--;) {
424da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
425da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
426da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
4275ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4287bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __asm__ __volatile__(
4295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
4305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
4317bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
4327bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
4335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4347bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
4357bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [src] "r"(src), [dst] "r"(dst));
4365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
4385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
4395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
4405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
4415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 8:
4425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 2 word storage */
4437bc9febe8749e98a3812a0dc4380ceae75c29450Johann      for (y = h; y--;) {
444da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
445da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
446da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
4475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4487bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __asm__ __volatile__(
4495ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
4505ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
4515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         4(%[src])      \n\t"
4525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         4(%[dst])      \n\t"
4537bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
4547bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
4557bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
4567bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
4575ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4587bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
4597bc9febe8749e98a3812a0dc4380ceae75c29450Johann              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
4607bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [src] "r"(src), [dst] "r"(dst));
4615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
4635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
4645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
4655ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
4665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 16:
4675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 4 word storage */
4687bc9febe8749e98a3812a0dc4380ceae75c29450Johann      for (y = h; y--;) {
469da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
470da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
471da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
4725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4737bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __asm__ __volatile__(
4745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
4755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
4765ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         4(%[src])      \n\t"
4775ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         4(%[dst])      \n\t"
4787bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
4795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         8(%[src])      \n\t"
4805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         8(%[dst])      \n\t"
4817bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
4827bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
4837bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
4845ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         12(%[src])     \n\t"
4855ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[dst])     \n\t"
4867bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
4877bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
4887bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
4897bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
4905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4917bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
4927bc9febe8749e98a3812a0dc4380ceae75c29450Johann              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
4937bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [src] "r"(src), [dst] "r"(dst));
4945ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
4955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
4965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
4975ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
4985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
4995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 32:
5005ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 8 word storage */
5017bc9febe8749e98a3812a0dc4380ceae75c29450Johann      for (y = h; y--;) {
502da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
503da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
504da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
5055ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5067bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __asm__ __volatile__(
5075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
5085ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
5095ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         4(%[src])      \n\t"
5105ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         4(%[dst])      \n\t"
5117bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         8(%[src])      \n\t"
5135ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         8(%[dst])      \n\t"
5147bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
5157bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
5167bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
5175ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         12(%[src])     \n\t"
5185ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[dst])     \n\t"
5197bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         16(%[src])     \n\t"
5215ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         16(%[dst])     \n\t"
5227bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
5237bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
5247bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
5255ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         20(%[src])     \n\t"
5265ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         20(%[dst])     \n\t"
5277bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5285ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         24(%[src])     \n\t"
5295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         24(%[dst])     \n\t"
5307bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         16(%[dst])     \n\t" /* store */
5317bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
5327bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         20(%[dst])     \n\t" /* store */
5335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         28(%[src])     \n\t"
5345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         28(%[dst])     \n\t"
5357bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5367bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         24(%[dst])     \n\t" /* store */
5377bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
5387bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         28(%[dst])     \n\t" /* store */
5395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5407bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
5417bc9febe8749e98a3812a0dc4380ceae75c29450Johann              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
5427bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [src] "r"(src), [dst] "r"(dst));
5435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
5455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
5465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
5475ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
5485ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    case 64:
549da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      prefetch_load(src + 64);
550da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian      prefetch_store(dst + 32);
5515ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5525ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      /* 16 word storage */
5537bc9febe8749e98a3812a0dc4380ceae75c29450Johann      for (y = h; y--;) {
554da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride);
555da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 32);
556da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_load(src + src_stride + 64);
557da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride);
558da49e34c1fb5e99681f4ad99c21d9cfd83eddb96Vignesh Venkatasubramanian        prefetch_store(dst + dst_stride + 32);
5595ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
5607bc9febe8749e98a3812a0dc4380ceae75c29450Johann        __asm__ __volatile__(
5615ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         0(%[src])      \n\t"
5625ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         0(%[dst])      \n\t"
5635ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         4(%[src])      \n\t"
5645ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         4(%[dst])      \n\t"
5657bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5665ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         8(%[src])      \n\t"
5675ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         8(%[dst])      \n\t"
5687bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
5697bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
5707bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
5715ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         12(%[src])     \n\t"
5725ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         12(%[dst])     \n\t"
5737bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5745ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         16(%[src])     \n\t"
5755ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         16(%[dst])     \n\t"
5767bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
5777bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
5787bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
5795ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         20(%[src])     \n\t"
5805ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         20(%[dst])     \n\t"
5817bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5825ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         24(%[src])     \n\t"
5835ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         24(%[dst])     \n\t"
5847bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         16(%[dst])     \n\t" /* store */
5857bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
5867bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         20(%[dst])     \n\t" /* store */
5875ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         28(%[src])     \n\t"
5885ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         28(%[dst])     \n\t"
5897bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5905ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         32(%[src])     \n\t"
5915ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         32(%[dst])     \n\t"
5927bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         24(%[dst])     \n\t" /* store */
5937bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
5947bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         28(%[dst])     \n\t" /* store */
5955ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         36(%[src])     \n\t"
5965ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         36(%[dst])     \n\t"
5977bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
5985ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         40(%[src])     \n\t"
5995ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         40(%[dst])     \n\t"
6007bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         32(%[dst])     \n\t" /* store */
6017bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
6027bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         36(%[dst])     \n\t" /* store */
6035ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         44(%[src])     \n\t"
6045ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         44(%[dst])     \n\t"
6057bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
6065ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         48(%[src])     \n\t"
6075ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         48(%[dst])     \n\t"
6087bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         40(%[dst])     \n\t" /* store */
6097bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
6107bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         44(%[dst])     \n\t" /* store */
6115ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         52(%[src])     \n\t"
6125ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         52(%[dst])     \n\t"
6137bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
6145ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp1],         56(%[src])     \n\t"
6155ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp2],         56(%[dst])     \n\t"
6167bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         48(%[dst])     \n\t" /* store */
6177bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
6187bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         52(%[dst])     \n\t" /* store */
6195ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp3],         60(%[src])     \n\t"
6205ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang            "ulw              %[tp4],         60(%[dst])     \n\t"
6217bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
6227bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn1],         56(%[dst])     \n\t" /* store */
6237bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
6247bc9febe8749e98a3812a0dc4380ceae75c29450Johann            "sw               %[tn2],         60(%[dst])     \n\t" /* store */
6257bc9febe8749e98a3812a0dc4380ceae75c29450Johann
6267bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
6277bc9febe8749e98a3812a0dc4380ceae75c29450Johann              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
6287bc9febe8749e98a3812a0dc4380ceae75c29450Johann            : [src] "r"(src), [dst] "r"(dst));
6295ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6305ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
6315ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
6325ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
6335ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
6345ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang    default:
6355ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      for (y = h; y > 0; --y) {
6365ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        for (x = 0; x < w; ++x) {
6375ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang          dst[x] = (dst[x] + src[x] + 1) >> 1;
6385ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        }
6395ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang
6405ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        src += src_stride;
6415ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang        dst += dst_stride;
6425ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      }
6435ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang      break;
6445ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang  }
6455ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang}
6465ae7ac49f08a179e4f054d99fcfc9dce78d26e58hkuang#endif
647