/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
/* Horizontal 8-tap convolution of a 4-pixel-wide column strip.
 *
 * src       - points at (first output pixel - 3), i.e. the caller has already
 *             applied the 8-tap filter's left offset (see
 *             vpx_convolve8_horiz_dspr2, which does `src -= 3`).
 * filter_x0 - 8 16-bit taps, read in pairs as packed 32-bit words for the
 *             dpa.w.ph multiply-accumulate.  NOTE(review): the int32_t cast
 *             assumes 4-byte alignment of the tap array and little-endian
 *             pairing - confirm against the filter tables' declarations.
 * h         - number of rows to process.
 *
 * Requires the DSP control "pos" field to be preset via wrdsp (done by the
 * dispatcher) so that `extp ..., 31` extracts the rounded, >>FILTER_BITS
 * filter result from the 64-bit accumulator.
 */
static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int16_t *filter_x0, int32_t h) {
  int32_t y;
  /* Clamp lookup table: filter sums (which may under/overshoot [0,255])
     are used as an index via lbux to produce a clipped pixel byte. */
  uint8_t *cm = vpx_ff_cropTbl;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  /* 64 = 1 << (FILTER_BITS - 1): rounding offset preloaded into each
     accumulator (mtlo) before the dot products. */
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  /* Taps as four packed (pair-of-int16) words: {t0,t1} {t2,t3} {t4,t5} {t6,t7}. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "preceu.ph.qbl %[p4], %[tp2] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "ulw %[tn2], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"

        /* even 2. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tn2] \n\t"
        /* balign shifts the 12-byte window right by one byte, forming the
           source words for the odd-phase output pixels. */
        "balign %[tn1], %[tn2], 3 \n\t"
        "balign %[tn2], %[tp2], 3 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* odd 1. pixel */
        "lbux %[tp1], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "preceu.ph.qbr %[n1], %[tp2] \n\t"
        "preceu.ph.qbl %[n2], %[tp2] \n\t"
        "preceu.ph.qbr %[n3], %[tn2] \n\t"
        "preceu.ph.qbl %[n4], %[tn2] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 2. pixel */
        "lbux %[tp2], %[Temp3](%[cm]) \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[n1], %[tn1] \n\t"
        "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
        "extp %[Temp4], $ac2, 31 \n\t"

        /* clamp */
        "lbux %[tn1], %[Temp2](%[cm]) \n\t"
        "lbux %[n2], %[Temp4](%[cm]) \n\t"

        /* store bytes */
        "sb %[tp1], 0(%[dst]) \n\t"
        "sb %[tn1], 1(%[dst]) \n\t"
        "sb %[tp2], 2(%[dst]) \n\t"
        "sb %[n2], 3(%[dst]) \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* Horizontal 8-tap convolution of an 8-pixel-wide column strip.
 * Same contract as convolve_horiz_4_dspr2 (pre-offset src, packed taps,
 * wrdsp-configured extract position); even and odd output phases are
 * interleaved across the three DSP accumulators to hide latency.
 */
static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-byte lookup table (lbux) */
  uint32_t vector4a = 64;        /* rounding: 1 << (FILTER_BITS - 1) */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  /* Packed tap pairs; see alignment note in convolve_horiz_4_dspr2. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "preceu.ph.qbl %[p4], %[tp2] \n\t"
        "ulw %[tn2], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr %[p1], %[tn2] \n\t"
        "preceu.ph.qbl %[n1], %[tn2] \n\t"
        "ulw %[tn1], 12(%[src]) \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* even 3. pixel */
        "lbux %[st0], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "preceu.ph.qbr %[p2], %[tn1] \n\t"
        "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
        "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
        "extp %[Temp1], $ac1, 31 \n\t"

        /* even 4. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "sb %[st0], 0(%[dst]) \n\t"
        "lbux %[st1], %[Temp3](%[cm]) \n\t"

        /* Shift the 16-byte window right one byte for the odd phase. */
        "balign %[tn3], %[tn1], 3 \n\t"
        "balign %[tn1], %[tn2], 3 \n\t"
        "balign %[tn2], %[tp2], 3 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"

        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        "lbux %[st0], %[Temp1](%[cm]) \n\t"

        /* odd 1. pixel */
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "sb %[st1], 2(%[dst]) \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p2], %[tp2] \n\t"
        "preceu.ph.qbr %[p3], %[tn2] \n\t"
        "preceu.ph.qbl %[p4], %[tn2] \n\t"
        "sb %[st0], 4(%[dst]) \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 2. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tn1] \n\t"
        "preceu.ph.qbl %[n1], %[tn1] \n\t"
        "lbux %[st0], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac1, 31 \n\t"

        /* odd 3. pixel */
        "lbux %[st1], %[Temp2](%[cm]) \n\t"
        "preceu.ph.qbr %[p2], %[tn3] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 4. pixel */
        "sb %[st1], 1(%[dst]) \n\t"
        "sb %[st0], 6(%[dst]) \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
        "extp %[Temp1], $ac2, 31 \n\t"

        /* clamp */
        "lbux %[p4], %[Temp3](%[cm]) \n\t"
        "lbux %[p2], %[Temp2](%[cm]) \n\t"
        "lbux %[n1], %[Temp1](%[cm]) \n\t"

        /* store bytes */
        "sb %[p4], 3(%[dst]) \n\t"
        "sb %[p2], 5(%[dst]) \n\t"
        "sb %[n1], 7(%[dst]) \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
          [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* Horizontal 8-tap convolution of a (16 * count)-pixel-wide strip.
 * count = 1 handles w == 16, count = 2 handles w == 32.  Within each
 * 16-pixel chunk, the eight even outputs are computed from the aligned
 * loads and the eight odd outputs from loads offset by one byte, with
 * stores of chunk k overlapped against the arithmetic of chunk k+1.
 * Same contract as convolve_horiz_4_dspr2 otherwise.
 */
static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    const int16_t *filter_x0, int32_t h,
                                    int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-byte lookup table (lbux) */
  uint32_t vector_64 = 64;       /* rounding: 1 << (FILTER_BITS - 1) */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Packed tap pairs; see alignment note in convolve_horiz_4_dspr2. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
          "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
          "ulw %[qload2], 16(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
          "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
          "ulw %[qload3], 20(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
          "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */

          /* ODD pixels: reload the window offset by one byte. */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
          "ulw %[qload2], 17(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
          "ulw %[qload3], 21(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
          "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */

          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */

          "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
          "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
          "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* Horizontal 8-tap convolution of a 64-pixel-wide strip: the same
 * 16-pixel-chunk kernel as convolve_horiz_16_dspr2 with the chunk count
 * fixed at 4 and wider prefetching.
 */
static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
                                    uint8_t *dst_ptr, int32_t dst_stride,
                                    const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-byte lookup table (lbux) */
  uint32_t vector_64 = 64;       /* rounding: 1 << (FILTER_BITS - 1) */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Packed tap pairs; see alignment note in convolve_horiz_4_dspr2. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
          "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
          "ulw %[qload2], 16(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
          "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
          "ulw %[qload3], 20(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
          "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */

          /* ODD pixels: reload the window offset by one byte. */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
          "ulw %[qload2], 17(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
          "ulw %[qload3], 21(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
          "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */

          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */

          "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
          "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
          "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter12] "r"(filter12), [filter34] "r"(filter34),
            [filter56] "r"(filter56), [filter78] "r"(filter78),
            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
            [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* Public DSPr2 entry point for 8-tap horizontal convolution.
 * Dispatches on block width; only non-scaling (x_step_q4 == 16) filtering
 * is supported.  When the first four taps are all zero the filter is
 * effectively 2-tap and the convolve2 path is used instead.
 * NOTE(review): the second assert rejects one specific packed pattern of
 * taps 4..7; presumably a degenerate filter handled elsewhere - confirm
 * against the upstream filter tables.
 */
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h) {
  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
    vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

    prefetch_load((const uint8_t *)filter_x);
    /* Center the 8-tap window: output pixel i reads src[i-3 .. i+4]. */
    src -= 3;

    /* bit position for extract from acc: pos = 38 makes `extp ..., 31`
       yield the accumulator shifted right by FILTER_BITS (7). */
    __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                         :
                         : [pos] "r"(pos));

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);
    prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      case 8:
        convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
                               (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      case 16:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h, 1);
        break;
      case 32:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h);
        break;
      default:
        /* Unsupported width: undo the offset and fall back to C. */
        vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}
#endif