1233d2500723e5594f3e7c70896ffeeef32b9c950ywan/* 2233d2500723e5594f3e7c70896ffeeef32b9c950ywan * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3233d2500723e5594f3e7c70896ffeeef32b9c950ywan * 4233d2500723e5594f3e7c70896ffeeef32b9c950ywan * Use of this source code is governed by a BSD-style license 5233d2500723e5594f3e7c70896ffeeef32b9c950ywan * that can be found in the LICENSE file in the root of the source 6233d2500723e5594f3e7c70896ffeeef32b9c950ywan * tree. An additional intellectual property rights grant can be found 7233d2500723e5594f3e7c70896ffeeef32b9c950ywan * in the file PATENTS. All contributing project authors may 8233d2500723e5594f3e7c70896ffeeef32b9c950ywan * be found in the AUTHORS file in the root of the source tree. 9233d2500723e5594f3e7c70896ffeeef32b9c950ywan */ 10233d2500723e5594f3e7c70896ffeeef32b9c950ywan 11233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <assert.h> 12233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include <stdio.h> 13233d2500723e5594f3e7c70896ffeeef32b9c950ywan 14233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "./vpx_config.h" 15233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "./vp9_rtcd.h" 16233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_common.h" 17233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx/vpx_integer.h" 18233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vpx_ports/mem.h" 19233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/vp9_convolve.h" 20233d2500723e5594f3e7c70896ffeeef32b9c950ywan#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" 21233d2500723e5594f3e7c70896ffeeef32b9c950ywan 22233d2500723e5594f3e7c70896ffeeef32b9c950ywan#if HAVE_DSPR2 23233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void convolve_vert_4_dspr2(const uint8_t *src, 24233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t src_stride, 25233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint8_t *dst, 26233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t dst_stride, 27233d2500723e5594f3e7c70896ffeeef32b9c950ywan const int16_t *filter_y, 28233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t w, 29233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t h) { 30233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t x, y; 31233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *src_ptr; 32233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint8_t *dst_ptr; 33233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint8_t *cm = vp9_ff_cropTbl; 34233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t vector4a = 64; 35233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t load1, load2, load3, load4; 36233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t p1, p2; 37233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t n1, n2; 38233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t scratch1, scratch2; 39233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t store1, store2; 40233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t vector1b, vector2b, vector3b, vector4b; 41233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t Temp1, Temp2; 42233d2500723e5594f3e7c70896ffeeef32b9c950ywan 43233d2500723e5594f3e7c70896ffeeef32b9c950ywan vector1b = ((const int32_t *)filter_y)[0]; 44233d2500723e5594f3e7c70896ffeeef32b9c950ywan vector2b = ((const int32_t *)filter_y)[1]; 45233d2500723e5594f3e7c70896ffeeef32b9c950ywan vector3b = ((const int32_t *)filter_y)[2]; 46233d2500723e5594f3e7c70896ffeeef32b9c950ywan vector4b = ((const int32_t *)filter_y)[3]; 47233d2500723e5594f3e7c70896ffeeef32b9c950ywan 48233d2500723e5594f3e7c70896ffeeef32b9c950ywan src -= 3 * src_stride; 49233d2500723e5594f3e7c70896ffeeef32b9c950ywan 50233d2500723e5594f3e7c70896ffeeef32b9c950ywan for (y = h; y--;) { 51233d2500723e5594f3e7c70896ffeeef32b9c950ywan /* prefetch data to cache memory */ 52233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_prefetch_store(dst + dst_stride); 53233d2500723e5594f3e7c70896ffeeef32b9c950ywan 54233d2500723e5594f3e7c70896ffeeef32b9c950ywan for (x = 0; x < w; x += 4) { 55233d2500723e5594f3e7c70896ffeeef32b9c950ywan src_ptr = src + x; 56233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst_ptr = dst + x; 57233d2500723e5594f3e7c70896ffeeef32b9c950ywan 58233d2500723e5594f3e7c70896ffeeef32b9c950ywan __asm__ __volatile__ ( 59233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load1], 0(%[src_ptr]) \n\t" 60233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 61233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load2], 0(%[src_ptr]) \n\t" 62233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 63233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load3], 0(%[src_ptr]) \n\t" 64233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 65233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load4], 0(%[src_ptr]) \n\t" 66233d2500723e5594f3e7c70896ffeeef32b9c950ywan 67233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mtlo %[vector4a], $ac0 \n\t" 68233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mtlo %[vector4a], $ac1 \n\t" 69233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mtlo %[vector4a], $ac2 \n\t" 70233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mtlo %[vector4a], $ac3 \n\t" 71233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mthi $zero, $ac0 \n\t" 72233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mthi $zero, $ac1 \n\t" 73233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mthi $zero, $ac2 \n\t" 74233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mthi $zero, $ac3 \n\t" 75233d2500723e5594f3e7c70896ffeeef32b9c950ywan 76233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[scratch1], %[load1] \n\t" 77233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[p1], %[load2] \n\t" 78233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 79233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 80233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[scratch2], %[load3] \n\t" 81233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[p2], %[load4] \n\t" 82233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 83233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 84233d2500723e5594f3e7c70896ffeeef32b9c950ywan 85233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" 86233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" 87233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" 88233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" 89233d2500723e5594f3e7c70896ffeeef32b9c950ywan 90233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[scratch1], %[load1] \n\t" 91233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[p1], %[load2] \n\t" 92233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 93233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 94233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[scratch2], %[load3] \n\t" 95233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[p2], %[load4] \n\t" 96233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 97233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 98233d2500723e5594f3e7c70896ffeeef32b9c950ywan 99233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" 100233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" 101233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" 102233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" 103233d2500723e5594f3e7c70896ffeeef32b9c950ywan 104233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 105233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load1], 0(%[src_ptr]) \n\t" 106233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 107233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load2], 0(%[src_ptr]) \n\t" 108233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 109233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load3], 0(%[src_ptr]) \n\t" 110233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 111233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load4], 0(%[src_ptr]) \n\t" 112233d2500723e5594f3e7c70896ffeeef32b9c950ywan 113233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[scratch1], %[load1] \n\t" 114233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[p1], %[load2] \n\t" 115233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 116233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 117233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[scratch2], %[load3] \n\t" 118233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[p2], %[load4] \n\t" 119233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 120233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 121233d2500723e5594f3e7c70896ffeeef32b9c950ywan 122233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" 123233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" 124233d2500723e5594f3e7c70896ffeeef32b9c950ywan "extp %[Temp1], $ac0, 31 \n\t" 125233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" 126233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" 127233d2500723e5594f3e7c70896ffeeef32b9c950ywan "extp %[Temp2], $ac1, 31 \n\t" 128233d2500723e5594f3e7c70896ffeeef32b9c950ywan 129233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[scratch1], %[load1] \n\t" 130233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[p1], %[load2] \n\t" 131233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 132233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 133233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[scratch2], %[load3] \n\t" 134233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[p2], %[load4] \n\t" 135233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 136233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 137233d2500723e5594f3e7c70896ffeeef32b9c950ywan 138233d2500723e5594f3e7c70896ffeeef32b9c950ywan "lbux %[store1], %[Temp1](%[cm]) \n\t" 139233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" 140233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 141233d2500723e5594f3e7c70896ffeeef32b9c950ywan "extp %[Temp1], $ac2, 31 \n\t" 142233d2500723e5594f3e7c70896ffeeef32b9c950ywan 143233d2500723e5594f3e7c70896ffeeef32b9c950ywan "lbux %[store2], %[Temp2](%[cm]) \n\t" 144233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" 145233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" 146233d2500723e5594f3e7c70896ffeeef32b9c950ywan "extp %[Temp2], $ac3, 31 \n\t" 147233d2500723e5594f3e7c70896ffeeef32b9c950ywan 148233d2500723e5594f3e7c70896ffeeef32b9c950ywan "sb %[store1], 0(%[dst_ptr]) \n\t" 149233d2500723e5594f3e7c70896ffeeef32b9c950ywan "sb %[store2], 1(%[dst_ptr]) \n\t" 150233d2500723e5594f3e7c70896ffeeef32b9c950ywan 151233d2500723e5594f3e7c70896ffeeef32b9c950ywan "lbux %[store1], %[Temp1](%[cm]) \n\t" 152233d2500723e5594f3e7c70896ffeeef32b9c950ywan "lbux %[store2], %[Temp2](%[cm]) \n\t" 153233d2500723e5594f3e7c70896ffeeef32b9c950ywan 154233d2500723e5594f3e7c70896ffeeef32b9c950ywan "sb %[store1], 2(%[dst_ptr]) \n\t" 155233d2500723e5594f3e7c70896ffeeef32b9c950ywan "sb %[store2], 3(%[dst_ptr]) \n\t" 156233d2500723e5594f3e7c70896ffeeef32b9c950ywan 157233d2500723e5594f3e7c70896ffeeef32b9c950ywan : [load1] "=&r" (load1), [load2] "=&r" (load2), 158233d2500723e5594f3e7c70896ffeeef32b9c950ywan [load3] "=&r" (load3), [load4] "=&r" (load4), 159233d2500723e5594f3e7c70896ffeeef32b9c950ywan [p1] "=&r" (p1), [p2] "=&r" (p2), 160233d2500723e5594f3e7c70896ffeeef32b9c950ywan [n1] "=&r" (n1), [n2] "=&r" (n2), 161233d2500723e5594f3e7c70896ffeeef32b9c950ywan [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), 162233d2500723e5594f3e7c70896ffeeef32b9c950ywan [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 163233d2500723e5594f3e7c70896ffeeef32b9c950ywan [store1] "=&r" (store1), [store2] "=&r" (store2), 164233d2500723e5594f3e7c70896ffeeef32b9c950ywan [src_ptr] "+r" (src_ptr) 165233d2500723e5594f3e7c70896ffeeef32b9c950ywan : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 166233d2500723e5594f3e7c70896ffeeef32b9c950ywan [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), 167233d2500723e5594f3e7c70896ffeeef32b9c950ywan [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), 168233d2500723e5594f3e7c70896ffeeef32b9c950ywan [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) 169233d2500723e5594f3e7c70896ffeeef32b9c950ywan ); 170233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 171233d2500723e5594f3e7c70896ffeeef32b9c950ywan 172233d2500723e5594f3e7c70896ffeeef32b9c950ywan /* Next row... */ 173233d2500723e5594f3e7c70896ffeeef32b9c950ywan src += src_stride; 174233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst += dst_stride; 175233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 176233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 177233d2500723e5594f3e7c70896ffeeef32b9c950ywan 178233d2500723e5594f3e7c70896ffeeef32b9c950ywanstatic void convolve_vert_64_dspr2(const uint8_t *src, 179233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t src_stride, 180233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint8_t *dst, 181233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t dst_stride, 182233d2500723e5594f3e7c70896ffeeef32b9c950ywan const int16_t *filter_y, 183233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t h) { 184233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t x, y; 185233d2500723e5594f3e7c70896ffeeef32b9c950ywan const uint8_t *src_ptr; 186233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint8_t *dst_ptr; 187233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint8_t *cm = vp9_ff_cropTbl; 188233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t vector4a = 64; 189233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t load1, load2, load3, load4; 190233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t p1, p2; 191233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t n1, n2; 192233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t scratch1, scratch2; 193233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t store1, store2; 194233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t vector1b, vector2b, vector3b, vector4b; 195233d2500723e5594f3e7c70896ffeeef32b9c950ywan int32_t Temp1, Temp2; 196233d2500723e5594f3e7c70896ffeeef32b9c950ywan 197233d2500723e5594f3e7c70896ffeeef32b9c950ywan vector1b = ((const int32_t *)filter_y)[0]; 198233d2500723e5594f3e7c70896ffeeef32b9c950ywan vector2b = ((const int32_t *)filter_y)[1]; 199233d2500723e5594f3e7c70896ffeeef32b9c950ywan vector3b = ((const int32_t *)filter_y)[2]; 200233d2500723e5594f3e7c70896ffeeef32b9c950ywan vector4b = ((const int32_t *)filter_y)[3]; 201233d2500723e5594f3e7c70896ffeeef32b9c950ywan 202233d2500723e5594f3e7c70896ffeeef32b9c950ywan src -= 3 * src_stride; 203233d2500723e5594f3e7c70896ffeeef32b9c950ywan 204233d2500723e5594f3e7c70896ffeeef32b9c950ywan for (y = h; y--;) { 205233d2500723e5594f3e7c70896ffeeef32b9c950ywan /* prefetch data to cache memory */ 206233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_prefetch_store(dst + dst_stride); 207233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_prefetch_store(dst + dst_stride + 32); 208233d2500723e5594f3e7c70896ffeeef32b9c950ywan 209233d2500723e5594f3e7c70896ffeeef32b9c950ywan for (x = 0; x < 64; x += 4) { 210233d2500723e5594f3e7c70896ffeeef32b9c950ywan src_ptr = src + x; 211233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst_ptr = dst + x; 212233d2500723e5594f3e7c70896ffeeef32b9c950ywan 213233d2500723e5594f3e7c70896ffeeef32b9c950ywan __asm__ __volatile__ ( 214233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load1], 0(%[src_ptr]) \n\t" 215233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 216233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load2], 0(%[src_ptr]) \n\t" 217233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 218233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load3], 0(%[src_ptr]) \n\t" 219233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 220233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load4], 0(%[src_ptr]) \n\t" 221233d2500723e5594f3e7c70896ffeeef32b9c950ywan 222233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mtlo %[vector4a], $ac0 \n\t" 223233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mtlo %[vector4a], $ac1 \n\t" 224233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mtlo %[vector4a], $ac2 \n\t" 225233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mtlo %[vector4a], $ac3 \n\t" 226233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mthi $zero, $ac0 \n\t" 227233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mthi $zero, $ac1 \n\t" 228233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mthi $zero, $ac2 \n\t" 229233d2500723e5594f3e7c70896ffeeef32b9c950ywan "mthi $zero, $ac3 \n\t" 230233d2500723e5594f3e7c70896ffeeef32b9c950ywan 231233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[scratch1], %[load1] \n\t" 232233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[p1], %[load2] \n\t" 233233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 234233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 235233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[scratch2], %[load3] \n\t" 236233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[p2], %[load4] \n\t" 237233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 238233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 239233d2500723e5594f3e7c70896ffeeef32b9c950ywan 240233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" 241233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" 242233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" 243233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" 244233d2500723e5594f3e7c70896ffeeef32b9c950ywan 245233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[scratch1], %[load1] \n\t" 246233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[p1], %[load2] \n\t" 247233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 248233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 249233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[scratch2], %[load3] \n\t" 250233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[p2], %[load4] \n\t" 251233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 252233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 253233d2500723e5594f3e7c70896ffeeef32b9c950ywan 254233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" 255233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" 256233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" 257233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" 258233d2500723e5594f3e7c70896ffeeef32b9c950ywan 259233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 260233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load1], 0(%[src_ptr]) \n\t" 261233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 262233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load2], 0(%[src_ptr]) \n\t" 263233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 264233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load3], 0(%[src_ptr]) \n\t" 265233d2500723e5594f3e7c70896ffeeef32b9c950ywan "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 266233d2500723e5594f3e7c70896ffeeef32b9c950ywan "ulw %[load4], 0(%[src_ptr]) \n\t" 267233d2500723e5594f3e7c70896ffeeef32b9c950ywan 268233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[scratch1], %[load1] \n\t" 269233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[p1], %[load2] \n\t" 270233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 271233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 272233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[scratch2], %[load3] \n\t" 273233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbr %[p2], %[load4] \n\t" 274233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 275233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 276233d2500723e5594f3e7c70896ffeeef32b9c950ywan 277233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" 278233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" 279233d2500723e5594f3e7c70896ffeeef32b9c950ywan "extp %[Temp1], $ac0, 31 \n\t" 280233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" 281233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" 282233d2500723e5594f3e7c70896ffeeef32b9c950ywan "extp %[Temp2], $ac1, 31 \n\t" 283233d2500723e5594f3e7c70896ffeeef32b9c950ywan 284233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[scratch1], %[load1] \n\t" 285233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[p1], %[load2] \n\t" 286233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 287233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 288233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[scratch2], %[load3] \n\t" 289233d2500723e5594f3e7c70896ffeeef32b9c950ywan "preceu.ph.qbl %[p2], %[load4] \n\t" 290233d2500723e5594f3e7c70896ffeeef32b9c950ywan "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 291233d2500723e5594f3e7c70896ffeeef32b9c950ywan "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 292233d2500723e5594f3e7c70896ffeeef32b9c950ywan 293233d2500723e5594f3e7c70896ffeeef32b9c950ywan "lbux %[store1], %[Temp1](%[cm]) \n\t" 294233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" 295233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 296233d2500723e5594f3e7c70896ffeeef32b9c950ywan "extp %[Temp1], $ac2, 31 \n\t" 297233d2500723e5594f3e7c70896ffeeef32b9c950ywan 298233d2500723e5594f3e7c70896ffeeef32b9c950ywan "lbux %[store2], %[Temp2](%[cm]) \n\t" 299233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" 300233d2500723e5594f3e7c70896ffeeef32b9c950ywan "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" 301233d2500723e5594f3e7c70896ffeeef32b9c950ywan "extp %[Temp2], $ac3, 31 \n\t" 302233d2500723e5594f3e7c70896ffeeef32b9c950ywan 303233d2500723e5594f3e7c70896ffeeef32b9c950ywan "sb %[store1], 0(%[dst_ptr]) \n\t" 304233d2500723e5594f3e7c70896ffeeef32b9c950ywan "sb %[store2], 1(%[dst_ptr]) \n\t" 305233d2500723e5594f3e7c70896ffeeef32b9c950ywan 306233d2500723e5594f3e7c70896ffeeef32b9c950ywan "lbux %[store1], %[Temp1](%[cm]) \n\t" 307233d2500723e5594f3e7c70896ffeeef32b9c950ywan "lbux %[store2], %[Temp2](%[cm]) \n\t" 308233d2500723e5594f3e7c70896ffeeef32b9c950ywan 309233d2500723e5594f3e7c70896ffeeef32b9c950ywan "sb %[store1], 2(%[dst_ptr]) \n\t" 310233d2500723e5594f3e7c70896ffeeef32b9c950ywan "sb %[store2], 3(%[dst_ptr]) \n\t" 311233d2500723e5594f3e7c70896ffeeef32b9c950ywan 312233d2500723e5594f3e7c70896ffeeef32b9c950ywan : [load1] "=&r" (load1), [load2] "=&r" (load2), 313233d2500723e5594f3e7c70896ffeeef32b9c950ywan [load3] "=&r" (load3), [load4] "=&r" (load4), 314233d2500723e5594f3e7c70896ffeeef32b9c950ywan [p1] "=&r" (p1), [p2] "=&r" (p2), 315233d2500723e5594f3e7c70896ffeeef32b9c950ywan [n1] "=&r" (n1), [n2] "=&r" (n2), 316233d2500723e5594f3e7c70896ffeeef32b9c950ywan [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), 317233d2500723e5594f3e7c70896ffeeef32b9c950ywan [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 318233d2500723e5594f3e7c70896ffeeef32b9c950ywan [store1] "=&r" (store1), [store2] "=&r" (store2), 319233d2500723e5594f3e7c70896ffeeef32b9c950ywan [src_ptr] "+r" (src_ptr) 320233d2500723e5594f3e7c70896ffeeef32b9c950ywan : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 321233d2500723e5594f3e7c70896ffeeef32b9c950ywan [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), 322233d2500723e5594f3e7c70896ffeeef32b9c950ywan [vector4a] "r" (vector4a), [src_stride] "r" (src_stride), 323233d2500723e5594f3e7c70896ffeeef32b9c950ywan [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) 324233d2500723e5594f3e7c70896ffeeef32b9c950ywan ); 325233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 326233d2500723e5594f3e7c70896ffeeef32b9c950ywan 327233d2500723e5594f3e7c70896ffeeef32b9c950ywan /* Next row... */ 328233d2500723e5594f3e7c70896ffeeef32b9c950ywan src += src_stride; 329233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst += dst_stride; 330233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 331233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 332233d2500723e5594f3e7c70896ffeeef32b9c950ywan 333233d2500723e5594f3e7c70896ffeeef32b9c950ywanvoid vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, 334233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint8_t *dst, ptrdiff_t dst_stride, 335233d2500723e5594f3e7c70896ffeeef32b9c950ywan const int16_t *filter_x, int x_step_q4, 336233d2500723e5594f3e7c70896ffeeef32b9c950ywan const int16_t *filter_y, int y_step_q4, 337233d2500723e5594f3e7c70896ffeeef32b9c950ywan int w, int h) { 338233d2500723e5594f3e7c70896ffeeef32b9c950ywan if (((const int32_t *)filter_y)[1] == 0x800000) { 339233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_convolve_copy(src, src_stride, 340233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst, dst_stride, 341233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_x, x_step_q4, 342233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_y, y_step_q4, 343233d2500723e5594f3e7c70896ffeeef32b9c950ywan w, h); 344233d2500723e5594f3e7c70896ffeeef32b9c950ywan } else if (((const int32_t *)filter_y)[0] == 0) { 345233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_convolve2_vert_dspr2(src, src_stride, 346233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst, dst_stride, 347233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_x, x_step_q4, 348233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_y, y_step_q4, 349233d2500723e5594f3e7c70896ffeeef32b9c950ywan w, h); 350233d2500723e5594f3e7c70896ffeeef32b9c950ywan } else { 351233d2500723e5594f3e7c70896ffeeef32b9c950ywan if (16 == y_step_q4) { 352233d2500723e5594f3e7c70896ffeeef32b9c950ywan uint32_t pos = 38; 353233d2500723e5594f3e7c70896ffeeef32b9c950ywan 354233d2500723e5594f3e7c70896ffeeef32b9c950ywan /* bit positon for extract from acc */ 355233d2500723e5594f3e7c70896ffeeef32b9c950ywan __asm__ __volatile__ ( 356233d2500723e5594f3e7c70896ffeeef32b9c950ywan "wrdsp %[pos], 1 \n\t" 357233d2500723e5594f3e7c70896ffeeef32b9c950ywan : 358233d2500723e5594f3e7c70896ffeeef32b9c950ywan : [pos] "r" (pos) 359233d2500723e5594f3e7c70896ffeeef32b9c950ywan ); 360233d2500723e5594f3e7c70896ffeeef32b9c950ywan 361233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_prefetch_store(dst); 362233d2500723e5594f3e7c70896ffeeef32b9c950ywan 363233d2500723e5594f3e7c70896ffeeef32b9c950ywan switch (w) { 364233d2500723e5594f3e7c70896ffeeef32b9c950ywan case 4 : 365233d2500723e5594f3e7c70896ffeeef32b9c950ywan case 8 : 366233d2500723e5594f3e7c70896ffeeef32b9c950ywan case 16 : 367233d2500723e5594f3e7c70896ffeeef32b9c950ywan case 32 : 368233d2500723e5594f3e7c70896ffeeef32b9c950ywan convolve_vert_4_dspr2(src, src_stride, 369233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst, dst_stride, 370233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_y, w, h); 371233d2500723e5594f3e7c70896ffeeef32b9c950ywan break; 372233d2500723e5594f3e7c70896ffeeef32b9c950ywan case 64 : 373233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_prefetch_store(dst + 32); 374233d2500723e5594f3e7c70896ffeeef32b9c950ywan convolve_vert_64_dspr2(src, src_stride, 375233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst, dst_stride, 376233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_y, h); 377233d2500723e5594f3e7c70896ffeeef32b9c950ywan break; 378233d2500723e5594f3e7c70896ffeeef32b9c950ywan default: 379233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_convolve8_vert_c(src, src_stride, 380233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst, dst_stride, 381233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_x, x_step_q4, 382233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_y, y_step_q4, 383233d2500723e5594f3e7c70896ffeeef32b9c950ywan w, h); 384233d2500723e5594f3e7c70896ffeeef32b9c950ywan break; 385233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 386233d2500723e5594f3e7c70896ffeeef32b9c950ywan } else { 387233d2500723e5594f3e7c70896ffeeef32b9c950ywan vp9_convolve8_vert_c(src, src_stride, 388233d2500723e5594f3e7c70896ffeeef32b9c950ywan dst, dst_stride, 389233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_x, x_step_q4, 390233d2500723e5594f3e7c70896ffeeef32b9c950ywan filter_y, y_step_q4, 391233d2500723e5594f3e7c70896ffeeef32b9c950ywan w, h); 392233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 393233d2500723e5594f3e7c70896ffeeef32b9c950ywan } 394233d2500723e5594f3e7c70896ffeeef32b9c950ywan} 395233d2500723e5594f3e7c70896ffeeef32b9c950ywan 396233d2500723e5594f3e7c70896ffeeef32b9c950ywan#endif 397