/* convolve8_horiz_dspr2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04 */
/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

/*
 * Horizontal 8-tap sub-pixel interpolation filters for MIPS cores with the
 * DSPr2 ASE.  Each width-specialized helper computes, per output pixel,
 * an 8-tap dot product of unsigned source bytes against packed 16-bit taps
 * using the DSP accumulators ($ac1..$ac3), adds the rounding constant 64,
 * and clamps the result to [0, 255] through the vpx_ff_cropTbl lookup.
 *
 * NOTE(review): the final shift of the accumulator is realized by
 * "extp %[...], $acN, 31" combined with the "wrdsp(pos = 38)" performed
 * once in vpx_convolve8_horiz_dspr2() before dispatch — the helpers are
 * only correct after that setup; confirm against the MIPS DSP ASE manual
 * before changing either constant.
 *
 * NOTE(review): the 8 int16 taps are read as four packed (int16,int16)
 * words through an int32 pointer, as dpa.w.ph requires.  This assumes the
 * tap array is 4-byte aligned and relies on type-punning the int16 array —
 * presumably safe under the project's build flags; verify.
 */

#if HAVE_DSPR2
/*
 * Filter one 4-pixel-wide column, h rows.
 *
 * src        - first source pixel minus 3 (the caller pre-adjusts so the
 *              8-tap window is centered).
 * src_stride - source row stride in bytes.
 * dst        - destination row pointer.
 * dst_stride - destination row stride in bytes.
 * filter_x0  - 8 packed 16-bit taps (see file note on int32 access).
 * h          - number of rows to filter.
 */
static void convolve_horiz_4_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_x0,
                                   int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-[0,255] lookup table */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;  /* rounding constant added via mtlo */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  /* Load the 8 taps as four packed 16-bit pairs for dpa.w.ph. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        /* Unaligned loads of 12 source bytes; even output pixels use the
           words at src+0/+4/+8, odd pixels use byte-realigned copies
           (the balign sequence below) starting at src+1. */
        "ulw              %[tp1],      0(%[src])                      \n\t"
        "ulw              %[tp2],      4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
        "ulw              %[tn2],      8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],    $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
        /* NOTE(review): balign ...,3 forms the byte-shifted words that
           start at src+1, feeding the odd output pixels — confirm
           endianness behavior against the DSP ASE manual. */
        "balign           %[tn1],      %[tn2],         3              \n\t"
        "balign           %[tn2],      %[tp2],         3              \n\t"
        "balign           %[tp2],      %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],    $ac2,           31             \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
        "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
        "extp             %[Temp2],    $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
        "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
        "extp             %[Temp4],    $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
        "lbux             %[n2],       %[Temp4](%[cm])                \n\t"

        /* store bytes */
        "sb               %[tp1],      0(%[dst])                      \n\t"
        "sb               %[tn1],      1(%[dst])                      \n\t"
        "sb               %[tp2],      2(%[dst])                      \n\t"
        "sb               %[n2],       3(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/*
 * Filter one 8-pixel-wide column, h rows.  Same contract as
 * convolve_horiz_4_dspr2(); computes 4 even and 4 odd output pixels per
 * row, software-pipelining the clamp/store of one pixel under the dot
 * product of the next.
 */
static void convolve_horiz_8_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_x0,
                                   int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-[0,255] lookup table */
  uint32_t vector4a = 64;        /* rounding constant added via mtlo */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  /* Load the 8 taps as four packed 16-bit pairs for dpa.w.ph. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],      0(%[src])                      \n\t"
        "ulw              %[tp2],      4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
        "ulw              %[tn2],      8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp1],    $ac3,           31             \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
        "ulw              %[tn1],      12(%[src])                     \n\t"
        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],    $ac2,           31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a], $ac1                           \n\t"
        "mthi             $zero,       $ac1                           \n\t"
        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
        "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
        "extp             %[Temp1],    $ac1,           31             \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "sb               %[st0],      0(%[dst])                      \n\t"
        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"

        /* NOTE(review): realign loaded words to the src+1 byte offset
           for the odd output pixels — confirm endianness assumption. */
        "balign           %[tn3],      %[tn1],         3              \n\t"
        "balign           %[tn1],      %[tn2],         3              \n\t"
        "balign           %[tn2],      %[tp2],         3              \n\t"
        "balign           %[tp2],      %[tp1],         3              \n\t"

        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
        "extp             %[Temp3],    $ac2,           31             \n\t"

        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac1                           \n\t"
        "mthi             $zero,       $ac1                           \n\t"
        "sb               %[st1],      2(%[dst])                      \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
        "sb               %[st0],      4(%[dst])                      \n\t"
        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
        "extp             %[Temp2],    $ac3,           31             \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a], $ac3                           \n\t"
        "mthi             $zero,       $ac3                           \n\t"
        "mtlo             %[vector4a], $ac2                           \n\t"
        "mthi             $zero,       $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
        "extp             %[Temp3],    $ac1,           31             \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
        "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
        "extp             %[Temp2],    $ac3,           31             \n\t"

        /* odd 4. pixel */
        "sb               %[st1],      1(%[dst])                      \n\t"
        "sb               %[st0],      6(%[dst])                      \n\t"
        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
        "extp             %[Temp1],    $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"

        /* store bytes */
        "sb               %[p4],       3(%[dst])                      \n\t"
        "sb               %[p2],       5(%[dst])                      \n\t"
        "sb               %[n1],       7(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/*
 * Filter a (16 * count)-pixel-wide block, h rows.  count is 1 for w == 16
 * and 2 for w == 32 (see the dispatcher).  Each 16-pixel strip computes
 * 8 even pixels from unaligned word loads at src+0.. and 8 odd pixels
 * from loads at src+1.., deeply pipelined across the three accumulators:
 * the trailing /* even N * / / /* odd N * / comments mark which output
 * pixel each instruction belongs to.
 */
static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
                                    int32_t src_stride,
                                    uint8_t *dst_ptr,
                                    int32_t dst_stride,
                                    const int16_t *filter_x0,
                                    int32_t h,
                                    int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-[0,255] lookup table */
  uint32_t vector_64 = 64;       /* rounding constant added via mtlo */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Load the 8 taps as four packed 16-bit pairs for dpa.w.ph. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                    \n\t"
          "ulw              %[qload2],    4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
          "mthi             $zero,        $ac1                         \n\t"
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "ulw              %[qload3],    8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,         %[p1],        %[filter12]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter34]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p3],        %[filter56]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter78]    \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "ulw              %[qload1],    12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,         %[p2],        %[filter12]    \n\t" /* even 2 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter34]    \n\t" /* even 2 */
          "dpa.w.ph         $ac2,         %[p4],        %[filter56]    \n\t" /* even 2 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter78]    \n\t" /* even 2 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* even 2 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter12]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter34]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p1],        %[filter56]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p5],        %[filter78]    \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[st2],       2(%[dst])                    \n\t" /* even 2 */
          "ulw              %[qload2],    16(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p4],        %[filter12]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter34]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p5],        %[filter56]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter78]    \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter12]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p5],        %[filter34]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p2],        %[filter56]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter78]    \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
          "ulw              %[qload3],    20(%[src])                   \n\t"
          "dpa.w.ph         $ac3,         %[p5],        %[filter12]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter34]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter56]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter78]    \n\t" /* even 6 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter12]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p3],        %[filter34]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter56]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter78]    \n\t" /* even 7 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                         \n\t"
          "dpa.w.ph         $ac2,         %[p3],        %[filter12]    \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p4],        %[filter34]    \n\t" /* even 8 */
          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter56]    \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p5],        %[filter78]    \n\t" /* even 8 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels: reload the same window shifted by one byte. */
          "ulw              %[qload1],    1(%[src])                    \n\t"
          "ulw              %[qload2],    5(%[src])                    \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
          "ulw              %[qload3],    9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,         %[p1],        %[filter12]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter34]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter56]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter78]    \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
          "ulw              %[qload1],    13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p2],        %[filter12]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p3],        %[filter34]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter56]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter78]    \n\t" /* odd 2 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter12]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p4],        %[filter34]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter56]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p5],        %[filter78]    \n\t" /* odd 3 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
          "ulw              %[qload2],    17(%[src])                   \n\t"
          "dpa.w.ph         $ac3,         %[p4],        %[filter12]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p1],        %[filter34]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p5],        %[filter56]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter78]    \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter12]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p5],        %[filter34]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter56]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p3],        %[filter78]    \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
          "ulw              %[qload3],    21(%[src])                   \n\t"
          "dpa.w.ph         $ac2,         %[p5],        %[filter12]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p2],        %[filter34]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter56]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p4],        %[filter78]    \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter12]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter34]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter56]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p1],        %[filter78]    \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],        %[filter12]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter34]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter56]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p5],        %[filter78]    \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* odd 8 */

          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */

          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst),
            [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/*
 * Filter a 64-pixel-wide block, h rows.  Identical per-strip code to
 * convolve_horiz_16_dspr2() but with the strip count fixed at 4 and one
 * extra prefetch pair for the wider rows.
 */
static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
                                    int32_t src_stride,
                                    uint8_t *dst_ptr,
                                    int32_t dst_stride,
                                    const int16_t *filter_x0,
                                    int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;  /* clamp-to-[0,255] lookup table */
  uint32_t vector_64 = 64;       /* rounding constant added via mtlo */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* Load the 8 taps as four packed 16-bit pairs for dpa.w.ph. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                    \n\t"
          "ulw              %[qload2],    4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
          "mthi             $zero,        $ac1                         \n\t"
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "ulw              %[qload3],    8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,         %[p1],        %[filter12]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter34]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p3],        %[filter56]    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter78]    \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "ulw              %[qload1],    12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,         %[p2],        %[filter12]    \n\t" /* even 2 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter34]    \n\t" /* even 2 */
          "dpa.w.ph         $ac2,         %[p4],        %[filter56]    \n\t" /* even 2 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter78]    \n\t" /* even 2 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* even 2 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter12]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter34]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p1],        %[filter56]    \n\t" /* even 3 */
          "dpa.w.ph         $ac3,         %[p5],        %[filter78]    \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 2 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[st2],       2(%[dst])                    \n\t" /* even 2 */
          "ulw              %[qload2],    16(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p4],        %[filter12]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter34]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p5],        %[filter56]    \n\t" /* even 4 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter78]    \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter12]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p5],        %[filter34]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p2],        %[filter56]    \n\t" /* even 5 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter78]    \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
          "ulw              %[qload3],    20(%[src])                   \n\t"
          "dpa.w.ph         $ac3,         %[p5],        %[filter12]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter34]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter56]    \n\t" /* even 6 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter78]    \n\t" /* even 6 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter12]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p3],        %[filter34]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter56]    \n\t" /* even 7 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter78]    \n\t" /* even 7 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                         \n\t"
          "dpa.w.ph         $ac2,         %[p3],        %[filter12]    \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p4],        %[filter34]    \n\t" /* even 8 */
          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter56]    \n\t" /* even 8 */
          "dpa.w.ph         $ac2,         %[p5],        %[filter78]    \n\t" /* even 8 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels: reload the same window shifted by one byte. */
          "ulw              %[qload1],    1(%[src])                    \n\t"
          "ulw              %[qload2],    5(%[src])                    \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
          "ulw              %[qload3],    9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,         %[p1],        %[filter12]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter34]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter56]    \n\t" /* odd 1 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter78]    \n\t" /* odd 1 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
          "ulw              %[qload1],    13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,         %[p2],        %[filter12]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p3],        %[filter34]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter56]    \n\t" /* odd 2 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter78]    \n\t" /* odd 2 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter12]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p4],        %[filter34]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p1],        %[filter56]    \n\t" /* odd 3 */
          "dpa.w.ph         $ac2,         %[p5],        %[filter78]    \n\t" /* odd 3 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
          "ulw              %[qload2],    17(%[src])                   \n\t"
          "dpa.w.ph         $ac3,         %[p4],        %[filter12]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p1],        %[filter34]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p5],        %[filter56]    \n\t" /* odd 4 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter78]    \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                         \n\t"
          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter12]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p5],        %[filter34]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p2],        %[filter56]    \n\t" /* odd 5 */
          "dpa.w.ph         $ac1,         %[p3],        %[filter78]    \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                         \n\t"
          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
          "ulw              %[qload3],    21(%[src])                   \n\t"
          "dpa.w.ph         $ac2,         %[p5],        %[filter12]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p2],        %[filter34]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p3],        %[filter56]    \n\t" /* odd 6 */
          "dpa.w.ph         $ac2,         %[p4],        %[filter78]    \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,         31             \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                         \n\t"
          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,         %[p2],        %[filter12]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p3],        %[filter34]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p4],        %[filter56]    \n\t" /* odd 7 */
          "dpa.w.ph         $ac3,         %[p1],        %[filter78]    \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,         31             \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],        %[filter12]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p4],        %[filter34]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p1],        %[filter56]    \n\t" /* odd 8 */
          "dpa.w.ph         $ac1,         %[p5],        %[filter78]    \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,         31             \n\t" /* odd 8 */

          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */

          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst),
            [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/*
 * Public entry point for the DSPr2 horizontal 8-tap convolution.
 * Dispatches on width to the specialized helpers above; unsupported
 * widths fall back to the generic C implementation.  Horizontal scaling
 * is not supported (x_step_q4 must be 16, i.e. one filter phase for the
 * whole row).  filter_y/y_step_q4 are unused here except by fallbacks.
 */
void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  assert(x_step_q4 == 16);
  /* NOTE(review): rejects a tap pattern whose second packed pair equals
     0x800000 — presumably a degenerate/identity filter the asm paths do
     not handle; confirm intent against the other dspr2 convolve files. */
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
    /* First two taps are zero: presumably the bilinear (2-tap) filter;
       use the cheaper specialized path. */
    vpx_convolve2_horiz_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
  } else {
    uint32_t pos = 38;

    prefetch_load((const uint8_t *)filter_x);
    src -= 3;  /* center the 8-tap window on the output pixel */

    /* bit position for extract from acc: all helpers above rely on this
       wrdsp setup for their "extp ..., 31" result extraction. */
    __asm__ __volatile__ (
        "wrdsp %[pos], 1 \n\t"
        :
        : [pos] "r" (pos)
    );

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);
    prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_horiz_4_dspr2(src, (int32_t)src_stride,
                               dst, (int32_t)dst_stride,
                               filter_x, (int32_t)h);
        break;
      case 8:
        convolve_horiz_8_dspr2(src, (int32_t)src_stride,
                               dst, (int32_t)dst_stride,
                               filter_x, (int32_t)h);
        break;
      case 16:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h, 1);
        break;
      case 32:
        /* 32-wide is two 16-wide strips per row. */
        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        convolve_horiz_64_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h);
        break;
      default:
        /* Unsupported width: undo the window adjustment and use the
           generic C path. */
        vpx_convolve8_horiz_c(src + 3, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
        break;
    }
  }
}
#endif