1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <stdio.h> 13 14#include "./vpx_dsp_rtcd.h" 15#include "vpx_dsp/mips/convolve_common_dspr2.h" 16#include "vpx_dsp/vpx_dsp_common.h" 17#include "vpx_dsp/vpx_filter.h" 18#include "vpx_ports/mem.h" 19 20#if HAVE_DSPR2 21static void convolve_horiz_4_transposed_dspr2(const uint8_t *src, 22 int32_t src_stride, uint8_t *dst, 23 int32_t dst_stride, 24 const int16_t *filter_x0, 25 int32_t h) { 26 int32_t y; 27 uint8_t *cm = vpx_ff_cropTbl; 28 uint8_t *dst_ptr; 29 int32_t vector1b, vector2b, vector3b, vector4b; 30 int32_t Temp1, Temp2, Temp3, Temp4; 31 uint32_t vector4a = 64; 32 uint32_t tp1, tp2; 33 uint32_t p1, p2, p3, p4; 34 uint32_t tn1, tn2; 35 36 vector1b = ((const int32_t *)filter_x0)[0]; 37 vector2b = ((const int32_t *)filter_x0)[1]; 38 vector3b = ((const int32_t *)filter_x0)[2]; 39 vector4b = ((const int32_t *)filter_x0)[3]; 40 41 for (y = h; y--;) { 42 dst_ptr = dst; 43 /* prefetch data to cache memory */ 44 prefetch_load(src + src_stride); 45 prefetch_load(src + src_stride + 32); 46 47 __asm__ __volatile__( 48 "ulw %[tp1], 0(%[src]) \n\t" 49 "ulw %[tp2], 4(%[src]) \n\t" 50 51 /* even 1. 
pixel */ 52 "mtlo %[vector4a], $ac3 \n\t" 53 "mthi $zero, $ac3 \n\t" 54 "preceu.ph.qbr %[p1], %[tp1] \n\t" 55 "preceu.ph.qbl %[p2], %[tp1] \n\t" 56 "preceu.ph.qbr %[p3], %[tp2] \n\t" 57 "preceu.ph.qbl %[p4], %[tp2] \n\t" 58 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 59 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 60 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 61 "ulw %[tn2], 8(%[src]) \n\t" 62 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 63 "extp %[Temp1], $ac3, 31 \n\t" 64 65 /* even 2. pixel */ 66 "mtlo %[vector4a], $ac2 \n\t" 67 "mthi $zero, $ac2 \n\t" 68 "preceu.ph.qbr %[p1], %[tn2] \n\t" 69 "balign %[tn1], %[tn2], 3 \n\t" 70 "balign %[tn2], %[tp2], 3 \n\t" 71 "balign %[tp2], %[tp1], 3 \n\t" 72 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 73 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 74 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 75 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 76 "extp %[Temp3], $ac2, 31 \n\t" 77 78 /* odd 1. pixel */ 79 "lbux %[tp1], %[Temp1](%[cm]) \n\t" 80 "mtlo %[vector4a], $ac3 \n\t" 81 "mthi $zero, $ac3 \n\t" 82 "preceu.ph.qbr %[p1], %[tp2] \n\t" 83 "preceu.ph.qbl %[p2], %[tp2] \n\t" 84 "preceu.ph.qbr %[p3], %[tn2] \n\t" 85 "preceu.ph.qbl %[p4], %[tn2] \n\t" 86 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 87 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 88 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 89 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 90 "extp %[Temp2], $ac3, 31 \n\t" 91 92 /* odd 2. 
pixel */ 93 "lbux %[tp2], %[Temp3](%[cm]) \n\t" 94 "mtlo %[vector4a], $ac2 \n\t" 95 "mthi $zero, $ac2 \n\t" 96 "preceu.ph.qbr %[p1], %[tn1] \n\t" 97 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 98 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 99 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 100 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 101 "extp %[Temp4], $ac2, 31 \n\t" 102 103 /* clamp */ 104 "lbux %[tn1], %[Temp2](%[cm]) \n\t" 105 "lbux %[p2], %[Temp4](%[cm]) \n\t" 106 107 /* store bytes */ 108 "sb %[tp1], 0(%[dst_ptr]) \n\t" 109 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 110 111 "sb %[tn1], 0(%[dst_ptr]) \n\t" 112 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 113 114 "sb %[tp2], 0(%[dst_ptr]) \n\t" 115 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 116 117 "sb %[p2], 0(%[dst_ptr]) \n\t" 118 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t" 119 120 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1), 121 [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), 122 [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), 123 [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr) 124 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), 125 [vector3b] "r"(vector3b), [vector4b] "r"(vector4b), 126 [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), 127 [dst_stride] "r"(dst_stride)); 128 129 /* Next row... 
*/ 130 src += src_stride; 131 dst += 1; 132 } 133} 134 135static void convolve_horiz_8_transposed_dspr2(const uint8_t *src, 136 int32_t src_stride, uint8_t *dst, 137 int32_t dst_stride, 138 const int16_t *filter_x0, 139 int32_t h) { 140 int32_t y; 141 uint8_t *cm = vpx_ff_cropTbl; 142 uint8_t *dst_ptr; 143 uint32_t vector4a = 64; 144 int32_t vector1b, vector2b, vector3b, vector4b; 145 int32_t Temp1, Temp2, Temp3; 146 uint32_t tp1, tp2, tp3; 147 uint32_t p1, p2, p3, p4, n1; 148 uint8_t *odd_dst; 149 uint32_t dst_pitch_2 = (dst_stride << 1); 150 151 vector1b = ((const int32_t *)filter_x0)[0]; 152 vector2b = ((const int32_t *)filter_x0)[1]; 153 vector3b = ((const int32_t *)filter_x0)[2]; 154 vector4b = ((const int32_t *)filter_x0)[3]; 155 156 for (y = h; y--;) { 157 /* prefetch data to cache memory */ 158 prefetch_load(src + src_stride); 159 prefetch_load(src + src_stride + 32); 160 161 dst_ptr = dst; 162 odd_dst = (dst_ptr + dst_stride); 163 164 __asm__ __volatile__( 165 "ulw %[tp2], 0(%[src]) \n\t" 166 "ulw %[tp1], 4(%[src]) \n\t" 167 168 /* even 1. pixel */ 169 "mtlo %[vector4a], $ac3 \n\t" 170 "mthi $zero, $ac3 \n\t" 171 "mtlo %[vector4a], $ac2 \n\t" 172 "mthi $zero, $ac2 \n\t" 173 "preceu.ph.qbr %[p1], %[tp2] \n\t" 174 "preceu.ph.qbl %[p2], %[tp2] \n\t" 175 "preceu.ph.qbr %[p3], %[tp1] \n\t" 176 "preceu.ph.qbl %[p4], %[tp1] \n\t" 177 "ulw %[tp3], 8(%[src]) \n\t" 178 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 179 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 180 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 181 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 182 "extp %[Temp1], $ac3, 31 \n\t" 183 184 /* even 2. pixel */ 185 "preceu.ph.qbr %[p1], %[tp3] \n\t" 186 "preceu.ph.qbl %[n1], %[tp3] \n\t" 187 "ulw %[tp2], 12(%[src]) \n\t" 188 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t" 189 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t" 190 "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t" 191 "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t" 192 "extp %[Temp3], $ac2, 31 \n\t" 193 194 /* even 3. 
pixel */ 195 "lbux %[Temp2], %[Temp1](%[cm]) \n\t" 196 "mtlo %[vector4a], $ac1 \n\t" 197 "mthi $zero, $ac1 \n\t" 198 "preceu.ph.qbr %[p2], %[tp2] \n\t" 199 "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t" 200 "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t" 201 "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t" 202 "lbux %[tp3], %[Temp3](%[cm]) \n\t" 203 "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t" 204 "extp %[p3], $ac1, 31 \n\t" 205 206 /* even 4. pixel */ 207 "mtlo %[vector4a], $ac2 \n\t" 208 "mthi $zero, $ac2 \n\t" 209 "mtlo %[vector4a], $ac3 \n\t" 210 "mthi $zero, $ac3 \n\t" 211 "sb %[Temp2], 0(%[dst_ptr]) \n\t" 212 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 213 "sb %[tp3], 0(%[dst_ptr]) \n\t" 214 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 215 216 "ulw %[tp1], 1(%[src]) \n\t" 217 "ulw %[tp3], 5(%[src]) \n\t" 218 219 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" 220 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" 221 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" 222 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 223 "extp %[Temp3], $ac2, 31 \n\t" 224 225 "lbux %[tp2], %[p3](%[cm]) \n\t" 226 227 /* odd 1. pixel */ 228 "mtlo %[vector4a], $ac1 \n\t" 229 "mthi $zero, $ac1 \n\t" 230 "preceu.ph.qbr %[p1], %[tp1] \n\t" 231 "preceu.ph.qbl %[p2], %[tp1] \n\t" 232 "preceu.ph.qbr %[p3], %[tp3] \n\t" 233 "preceu.ph.qbl %[p4], %[tp3] \n\t" 234 "sb %[tp2], 0(%[dst_ptr]) \n\t" 235 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 236 "ulw %[tp2], 9(%[src]) \n\t" 237 238 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t" 239 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t" 240 "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t" 241 "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t" 242 "extp %[Temp2], $ac3, 31 \n\t" 243 244 /* odd 2. 
pixel */ 245 "lbux %[tp1], %[Temp3](%[cm]) \n\t" 246 "mtlo %[vector4a], $ac3 \n\t" 247 "mthi $zero, $ac3 \n\t" 248 "mtlo %[vector4a], $ac2 \n\t" 249 "mthi $zero, $ac2 \n\t" 250 "preceu.ph.qbr %[p1], %[tp2] \n\t" 251 "preceu.ph.qbl %[n1], %[tp2] \n\t" 252 "ulw %[Temp1], 13(%[src]) \n\t" 253 "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t" 254 "sb %[tp1], 0(%[dst_ptr]) \n\t" 255 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t" 256 "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t" 257 "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t" 258 "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t" 259 "extp %[Temp3], $ac1, 31 \n\t" 260 261 /* odd 3. pixel */ 262 "lbux %[tp3], %[Temp2](%[cm]) \n\t" 263 "preceu.ph.qbr %[p2], %[Temp1] \n\t" 264 "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t" 265 "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t" 266 "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t" 267 "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t" 268 "extp %[Temp2], $ac3, 31 \n\t" 269 270 /* odd 4. pixel */ 271 "sb %[tp3], 0(%[odd_dst]) \n\t" 272 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 273 "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t" 274 "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t" 275 "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t" 276 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 277 "extp %[Temp1], $ac2, 31 \n\t" 278 279 /* clamp */ 280 "lbux %[p4], %[Temp3](%[cm]) \n\t" 281 "lbux %[p2], %[Temp2](%[cm]) \n\t" 282 "lbux %[n1], %[Temp1](%[cm]) \n\t" 283 284 /* store bytes */ 285 "sb %[p4], 0(%[odd_dst]) \n\t" 286 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 287 288 "sb %[p2], 0(%[odd_dst]) \n\t" 289 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t" 290 291 "sb %[n1], 0(%[odd_dst]) \n\t" 292 293 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1), 294 [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1), 295 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 296 [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst) 297 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), 298 [vector3b] "r"(vector3b), 
[vector4b] "r"(vector4b), 299 [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src), 300 [dst_pitch_2] "r"(dst_pitch_2)); 301 302 /* Next row... */ 303 src += src_stride; 304 dst += 1; 305 } 306} 307 308static void convolve_horiz_16_transposed_dspr2( 309 const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, 310 int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) { 311 int32_t c, y; 312 const uint8_t *src; 313 uint8_t *dst; 314 uint8_t *cm = vpx_ff_cropTbl; 315 uint32_t vector_64 = 64; 316 int32_t filter12, filter34, filter56, filter78; 317 int32_t Temp1, Temp2, Temp3; 318 uint32_t qload1, qload2; 319 uint32_t p1, p2, p3, p4, p5; 320 uint32_t st1, st2, st3; 321 uint32_t dst_pitch_2 = (dst_stride << 1); 322 uint8_t *odd_dst; 323 324 filter12 = ((const int32_t *)filter_x0)[0]; 325 filter34 = ((const int32_t *)filter_x0)[1]; 326 filter56 = ((const int32_t *)filter_x0)[2]; 327 filter78 = ((const int32_t *)filter_x0)[3]; 328 329 for (y = h; y--;) { 330 /* prefetch data to cache memory */ 331 prefetch_load(src_ptr + src_stride); 332 prefetch_load(src_ptr + src_stride + 32); 333 334 src = src_ptr; 335 dst = dst_ptr; 336 337 odd_dst = (dst + dst_stride); 338 339 for (c = 0; c < count; c++) { 340 __asm__ __volatile__( 341 "ulw %[qload1], 0(%[src]) " 342 "\n\t" 343 "ulw %[qload2], 4(%[src]) " 344 "\n\t" 345 346 /* even 1. 
pixel */ 347 "mtlo %[vector_64], $ac1 " 348 "\n\t" /* even 1 */ 349 "mthi $zero, $ac1 " 350 "\n\t" 351 "mtlo %[vector_64], $ac2 " 352 "\n\t" /* even 2 */ 353 "mthi $zero, $ac2 " 354 "\n\t" 355 "preceu.ph.qbr %[p3], %[qload2] " 356 "\n\t" 357 "preceu.ph.qbl %[p4], %[qload2] " 358 "\n\t" 359 "preceu.ph.qbr %[p1], %[qload1] " 360 "\n\t" 361 "preceu.ph.qbl %[p2], %[qload1] " 362 "\n\t" 363 "ulw %[qload2], 8(%[src]) " 364 "\n\t" 365 "dpa.w.ph $ac1, %[p1], %[filter12] " 366 "\n\t" /* even 1 */ 367 "dpa.w.ph $ac1, %[p2], %[filter34] " 368 "\n\t" /* even 1 */ 369 "dpa.w.ph $ac1, %[p3], %[filter56] " 370 "\n\t" /* even 1 */ 371 "dpa.w.ph $ac1, %[p4], %[filter78] " 372 "\n\t" /* even 1 */ 373 "extp %[Temp1], $ac1, 31 " 374 "\n\t" /* even 1 */ 375 376 /* even 2. pixel */ 377 "mtlo %[vector_64], $ac3 " 378 "\n\t" /* even 3 */ 379 "mthi $zero, $ac3 " 380 "\n\t" 381 "preceu.ph.qbr %[p1], %[qload2] " 382 "\n\t" 383 "preceu.ph.qbl %[p5], %[qload2] " 384 "\n\t" 385 "ulw %[qload1], 12(%[src]) " 386 "\n\t" 387 "dpa.w.ph $ac2, %[p2], %[filter12] " 388 "\n\t" /* even 1 */ 389 "dpa.w.ph $ac2, %[p3], %[filter34] " 390 "\n\t" /* even 1 */ 391 "dpa.w.ph $ac2, %[p4], %[filter56] " 392 "\n\t" /* even 1 */ 393 "dpa.w.ph $ac2, %[p1], %[filter78] " 394 "\n\t" /* even 1 */ 395 "lbux %[st1], %[Temp1](%[cm]) " 396 "\n\t" /* even 1 */ 397 "extp %[Temp2], $ac2, 31 " 398 "\n\t" /* even 1 */ 399 400 /* even 3. 
pixel */ 401 "mtlo %[vector_64], $ac1 " 402 "\n\t" /* even 4 */ 403 "mthi $zero, $ac1 " 404 "\n\t" 405 "preceu.ph.qbr %[p2], %[qload1] " 406 "\n\t" 407 "sb %[st1], 0(%[dst]) " 408 "\n\t" /* even 1 */ 409 "addu %[dst], %[dst], %[dst_pitch_2] " 410 " \n\t" 411 "dpa.w.ph $ac3, %[p3], %[filter12] " 412 "\n\t" /* even 3 */ 413 "dpa.w.ph $ac3, %[p4], %[filter34] " 414 "\n\t" /* even 3 */ 415 "dpa.w.ph $ac3, %[p1], %[filter56] " 416 "\n\t" /* even 3 */ 417 "dpa.w.ph $ac3, %[p5], %[filter78] " 418 "\n\t" /* even 3 */ 419 "extp %[Temp3], $ac3, 31 " 420 "\n\t" /* even 3 */ 421 "lbux %[st2], %[Temp2](%[cm]) " 422 "\n\t" /* even 1 */ 423 424 /* even 4. pixel */ 425 "mtlo %[vector_64], $ac2 " 426 "\n\t" /* even 5 */ 427 "mthi $zero, $ac2 " 428 "\n\t" 429 "preceu.ph.qbl %[p3], %[qload1] " 430 "\n\t" 431 "sb %[st2], 0(%[dst]) " 432 "\n\t" /* even 2 */ 433 "addu %[dst], %[dst], %[dst_pitch_2] " 434 "\n\t" 435 "ulw %[qload2], 16(%[src]) " 436 "\n\t" 437 "dpa.w.ph $ac1, %[p4], %[filter12] " 438 "\n\t" /* even 4 */ 439 "dpa.w.ph $ac1, %[p1], %[filter34] " 440 "\n\t" /* even 4 */ 441 "dpa.w.ph $ac1, %[p5], %[filter56] " 442 "\n\t" /* even 4 */ 443 "dpa.w.ph $ac1, %[p2], %[filter78] " 444 "\n\t" /* even 4 */ 445 "extp %[Temp1], $ac1, 31 " 446 "\n\t" /* even 4 */ 447 "lbux %[st3], %[Temp3](%[cm]) " 448 "\n\t" /* even 3 */ 449 450 /* even 5. pixel */ 451 "mtlo %[vector_64], $ac3 " 452 "\n\t" /* even 6 */ 453 "mthi $zero, $ac3 " 454 "\n\t" 455 "preceu.ph.qbr %[p4], %[qload2] " 456 "\n\t" 457 "sb %[st3], 0(%[dst]) " 458 "\n\t" /* even 3 */ 459 "addu %[dst], %[dst], %[dst_pitch_2] " 460 "\n\t" 461 "dpa.w.ph $ac2, %[p1], %[filter12] " 462 "\n\t" /* even 5 */ 463 "dpa.w.ph $ac2, %[p5], %[filter34] " 464 "\n\t" /* even 5 */ 465 "dpa.w.ph $ac2, %[p2], %[filter56] " 466 "\n\t" /* even 5 */ 467 "dpa.w.ph $ac2, %[p3], %[filter78] " 468 "\n\t" /* even 5 */ 469 "extp %[Temp2], $ac2, 31 " 470 "\n\t" /* even 5 */ 471 "lbux %[st1], %[Temp1](%[cm]) " 472 "\n\t" /* even 4 */ 473 474 /* even 6. 
pixel */ 475 "mtlo %[vector_64], $ac1 " 476 "\n\t" /* even 7 */ 477 "mthi $zero, $ac1 " 478 "\n\t" 479 "preceu.ph.qbl %[p1], %[qload2] " 480 "\n\t" 481 "sb %[st1], 0(%[dst]) " 482 "\n\t" /* even 4 */ 483 "addu %[dst], %[dst], %[dst_pitch_2] " 484 "\n\t" 485 "ulw %[qload1], 20(%[src]) " 486 "\n\t" 487 "dpa.w.ph $ac3, %[p5], %[filter12] " 488 "\n\t" /* even 6 */ 489 "dpa.w.ph $ac3, %[p2], %[filter34] " 490 "\n\t" /* even 6 */ 491 "dpa.w.ph $ac3, %[p3], %[filter56] " 492 "\n\t" /* even 6 */ 493 "dpa.w.ph $ac3, %[p4], %[filter78] " 494 "\n\t" /* even 6 */ 495 "extp %[Temp3], $ac3, 31 " 496 "\n\t" /* even 6 */ 497 "lbux %[st2], %[Temp2](%[cm]) " 498 "\n\t" /* even 5 */ 499 500 /* even 7. pixel */ 501 "mtlo %[vector_64], $ac2 " 502 "\n\t" /* even 8 */ 503 "mthi $zero, $ac2 " 504 "\n\t" 505 "preceu.ph.qbr %[p5], %[qload1] " 506 "\n\t" 507 "sb %[st2], 0(%[dst]) " 508 "\n\t" /* even 5 */ 509 "addu %[dst], %[dst], %[dst_pitch_2] " 510 "\n\t" 511 "dpa.w.ph $ac1, %[p2], %[filter12] " 512 "\n\t" /* even 7 */ 513 "dpa.w.ph $ac1, %[p3], %[filter34] " 514 "\n\t" /* even 7 */ 515 "dpa.w.ph $ac1, %[p4], %[filter56] " 516 "\n\t" /* even 7 */ 517 "dpa.w.ph $ac1, %[p1], %[filter78] " 518 "\n\t" /* even 7 */ 519 "extp %[Temp1], $ac1, 31 " 520 "\n\t" /* even 7 */ 521 "lbux %[st3], %[Temp3](%[cm]) " 522 "\n\t" /* even 6 */ 523 524 /* even 8. 
pixel */ 525 "mtlo %[vector_64], $ac3 " 526 "\n\t" /* odd 1 */ 527 "mthi $zero, $ac3 " 528 "\n\t" 529 "dpa.w.ph $ac2, %[p3], %[filter12] " 530 "\n\t" /* even 8 */ 531 "dpa.w.ph $ac2, %[p4], %[filter34] " 532 "\n\t" /* even 8 */ 533 "sb %[st3], 0(%[dst]) " 534 "\n\t" /* even 6 */ 535 "addu %[dst], %[dst], %[dst_pitch_2] " 536 "\n\t" 537 "dpa.w.ph $ac2, %[p1], %[filter56] " 538 "\n\t" /* even 8 */ 539 "dpa.w.ph $ac2, %[p5], %[filter78] " 540 "\n\t" /* even 8 */ 541 "extp %[Temp2], $ac2, 31 " 542 "\n\t" /* even 8 */ 543 "lbux %[st1], %[Temp1](%[cm]) " 544 "\n\t" /* even 7 */ 545 546 /* ODD pixels */ 547 "ulw %[qload1], 1(%[src]) " 548 "\n\t" 549 "ulw %[qload2], 5(%[src]) " 550 "\n\t" 551 552 /* odd 1. pixel */ 553 "mtlo %[vector_64], $ac1 " 554 "\n\t" /* odd 2 */ 555 "mthi $zero, $ac1 " 556 "\n\t" 557 "preceu.ph.qbr %[p1], %[qload1] " 558 "\n\t" 559 "preceu.ph.qbl %[p2], %[qload1] " 560 "\n\t" 561 "preceu.ph.qbr %[p3], %[qload2] " 562 "\n\t" 563 "preceu.ph.qbl %[p4], %[qload2] " 564 "\n\t" 565 "sb %[st1], 0(%[dst]) " 566 "\n\t" /* even 7 */ 567 "addu %[dst], %[dst], %[dst_pitch_2] " 568 "\n\t" 569 "ulw %[qload2], 9(%[src]) " 570 "\n\t" 571 "dpa.w.ph $ac3, %[p1], %[filter12] " 572 "\n\t" /* odd 1 */ 573 "dpa.w.ph $ac3, %[p2], %[filter34] " 574 "\n\t" /* odd 1 */ 575 "dpa.w.ph $ac3, %[p3], %[filter56] " 576 "\n\t" /* odd 1 */ 577 "dpa.w.ph $ac3, %[p4], %[filter78] " 578 "\n\t" /* odd 1 */ 579 "extp %[Temp3], $ac3, 31 " 580 "\n\t" /* odd 1 */ 581 "lbux %[st2], %[Temp2](%[cm]) " 582 "\n\t" /* even 8 */ 583 584 /* odd 2. 
pixel */ 585 "mtlo %[vector_64], $ac2 " 586 "\n\t" /* odd 3 */ 587 "mthi $zero, $ac2 " 588 "\n\t" 589 "preceu.ph.qbr %[p1], %[qload2] " 590 "\n\t" 591 "preceu.ph.qbl %[p5], %[qload2] " 592 "\n\t" 593 "sb %[st2], 0(%[dst]) " 594 "\n\t" /* even 8 */ 595 "ulw %[qload1], 13(%[src]) " 596 "\n\t" 597 "dpa.w.ph $ac1, %[p2], %[filter12] " 598 "\n\t" /* odd 2 */ 599 "dpa.w.ph $ac1, %[p3], %[filter34] " 600 "\n\t" /* odd 2 */ 601 "dpa.w.ph $ac1, %[p4], %[filter56] " 602 "\n\t" /* odd 2 */ 603 "dpa.w.ph $ac1, %[p1], %[filter78] " 604 "\n\t" /* odd 2 */ 605 "extp %[Temp1], $ac1, 31 " 606 "\n\t" /* odd 2 */ 607 "lbux %[st3], %[Temp3](%[cm]) " 608 "\n\t" /* odd 1 */ 609 610 /* odd 3. pixel */ 611 "mtlo %[vector_64], $ac3 " 612 "\n\t" /* odd 4 */ 613 "mthi $zero, $ac3 " 614 "\n\t" 615 "preceu.ph.qbr %[p2], %[qload1] " 616 "\n\t" 617 "sb %[st3], 0(%[odd_dst]) " 618 "\n\t" /* odd 1 */ 619 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 620 "\n\t" 621 "dpa.w.ph $ac2, %[p3], %[filter12] " 622 "\n\t" /* odd 3 */ 623 "dpa.w.ph $ac2, %[p4], %[filter34] " 624 "\n\t" /* odd 3 */ 625 "dpa.w.ph $ac2, %[p1], %[filter56] " 626 "\n\t" /* odd 3 */ 627 "dpa.w.ph $ac2, %[p5], %[filter78] " 628 "\n\t" /* odd 3 */ 629 "extp %[Temp2], $ac2, 31 " 630 "\n\t" /* odd 3 */ 631 "lbux %[st1], %[Temp1](%[cm]) " 632 "\n\t" /* odd 2 */ 633 634 /* odd 4. 
pixel */ 635 "mtlo %[vector_64], $ac1 " 636 "\n\t" /* odd 5 */ 637 "mthi $zero, $ac1 " 638 "\n\t" 639 "preceu.ph.qbl %[p3], %[qload1] " 640 "\n\t" 641 "sb %[st1], 0(%[odd_dst]) " 642 "\n\t" /* odd 2 */ 643 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 644 "\n\t" 645 "ulw %[qload2], 17(%[src]) " 646 "\n\t" 647 "dpa.w.ph $ac3, %[p4], %[filter12] " 648 "\n\t" /* odd 4 */ 649 "dpa.w.ph $ac3, %[p1], %[filter34] " 650 "\n\t" /* odd 4 */ 651 "dpa.w.ph $ac3, %[p5], %[filter56] " 652 "\n\t" /* odd 4 */ 653 "dpa.w.ph $ac3, %[p2], %[filter78] " 654 "\n\t" /* odd 4 */ 655 "extp %[Temp3], $ac3, 31 " 656 "\n\t" /* odd 4 */ 657 "lbux %[st2], %[Temp2](%[cm]) " 658 "\n\t" /* odd 3 */ 659 660 /* odd 5. pixel */ 661 "mtlo %[vector_64], $ac2 " 662 "\n\t" /* odd 6 */ 663 "mthi $zero, $ac2 " 664 "\n\t" 665 "preceu.ph.qbr %[p4], %[qload2] " 666 "\n\t" 667 "sb %[st2], 0(%[odd_dst]) " 668 "\n\t" /* odd 3 */ 669 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 670 "\n\t" 671 "dpa.w.ph $ac1, %[p1], %[filter12] " 672 "\n\t" /* odd 5 */ 673 "dpa.w.ph $ac1, %[p5], %[filter34] " 674 "\n\t" /* odd 5 */ 675 "dpa.w.ph $ac1, %[p2], %[filter56] " 676 "\n\t" /* odd 5 */ 677 "dpa.w.ph $ac1, %[p3], %[filter78] " 678 "\n\t" /* odd 5 */ 679 "extp %[Temp1], $ac1, 31 " 680 "\n\t" /* odd 5 */ 681 "lbux %[st3], %[Temp3](%[cm]) " 682 "\n\t" /* odd 4 */ 683 684 /* odd 6. 
pixel */ 685 "mtlo %[vector_64], $ac3 " 686 "\n\t" /* odd 7 */ 687 "mthi $zero, $ac3 " 688 "\n\t" 689 "preceu.ph.qbl %[p1], %[qload2] " 690 "\n\t" 691 "sb %[st3], 0(%[odd_dst]) " 692 "\n\t" /* odd 4 */ 693 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 694 "\n\t" 695 "ulw %[qload1], 21(%[src]) " 696 "\n\t" 697 "dpa.w.ph $ac2, %[p5], %[filter12] " 698 "\n\t" /* odd 6 */ 699 "dpa.w.ph $ac2, %[p2], %[filter34] " 700 "\n\t" /* odd 6 */ 701 "dpa.w.ph $ac2, %[p3], %[filter56] " 702 "\n\t" /* odd 6 */ 703 "dpa.w.ph $ac2, %[p4], %[filter78] " 704 "\n\t" /* odd 6 */ 705 "extp %[Temp2], $ac2, 31 " 706 "\n\t" /* odd 6 */ 707 "lbux %[st1], %[Temp1](%[cm]) " 708 "\n\t" /* odd 5 */ 709 710 /* odd 7. pixel */ 711 "mtlo %[vector_64], $ac1 " 712 "\n\t" /* odd 8 */ 713 "mthi $zero, $ac1 " 714 "\n\t" 715 "preceu.ph.qbr %[p5], %[qload1] " 716 "\n\t" 717 "sb %[st1], 0(%[odd_dst]) " 718 "\n\t" /* odd 5 */ 719 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 720 "\n\t" 721 "dpa.w.ph $ac3, %[p2], %[filter12] " 722 "\n\t" /* odd 7 */ 723 "dpa.w.ph $ac3, %[p3], %[filter34] " 724 "\n\t" /* odd 7 */ 725 "dpa.w.ph $ac3, %[p4], %[filter56] " 726 "\n\t" /* odd 7 */ 727 "dpa.w.ph $ac3, %[p1], %[filter78] " 728 "\n\t" /* odd 7 */ 729 "extp %[Temp3], $ac3, 31 " 730 "\n\t" /* odd 7 */ 731 732 /* odd 8. 
pixel */ 733 "dpa.w.ph $ac1, %[p3], %[filter12] " 734 "\n\t" /* odd 8 */ 735 "dpa.w.ph $ac1, %[p4], %[filter34] " 736 "\n\t" /* odd 8 */ 737 "dpa.w.ph $ac1, %[p1], %[filter56] " 738 "\n\t" /* odd 8 */ 739 "dpa.w.ph $ac1, %[p5], %[filter78] " 740 "\n\t" /* odd 8 */ 741 "extp %[Temp1], $ac1, 31 " 742 "\n\t" /* odd 8 */ 743 744 "lbux %[st2], %[Temp2](%[cm]) " 745 "\n\t" /* odd 6 */ 746 "lbux %[st3], %[Temp3](%[cm]) " 747 "\n\t" /* odd 7 */ 748 "lbux %[st1], %[Temp1](%[cm]) " 749 "\n\t" /* odd 8 */ 750 751 "sb %[st2], 0(%[odd_dst]) " 752 "\n\t" /* odd 6 */ 753 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 754 "\n\t" 755 756 "sb %[st3], 0(%[odd_dst]) " 757 "\n\t" /* odd 7 */ 758 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 759 "\n\t" 760 761 "sb %[st1], 0(%[odd_dst]) " 762 "\n\t" /* odd 8 */ 763 764 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), 765 [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), 766 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), 767 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 768 [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) 769 : [filter12] "r"(filter12), [filter34] "r"(filter34), 770 [filter56] "r"(filter56), [filter78] "r"(filter78), 771 [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), 772 [dst_pitch_2] "r"(dst_pitch_2)); 773 774 src += 16; 775 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); 776 odd_dst = (dst + dst_stride); 777 } 778 779 /* Next row... 
*/ 780 src_ptr += src_stride; 781 782 dst_ptr += 1; 783 } 784} 785 786static void convolve_horiz_64_transposed_dspr2( 787 const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr, 788 int32_t dst_stride, const int16_t *filter_x0, int32_t h) { 789 int32_t c, y; 790 const uint8_t *src; 791 uint8_t *dst; 792 uint8_t *cm = vpx_ff_cropTbl; 793 uint32_t vector_64 = 64; 794 int32_t filter12, filter34, filter56, filter78; 795 int32_t Temp1, Temp2, Temp3; 796 uint32_t qload1, qload2; 797 uint32_t p1, p2, p3, p4, p5; 798 uint32_t st1, st2, st3; 799 uint32_t dst_pitch_2 = (dst_stride << 1); 800 uint8_t *odd_dst; 801 802 filter12 = ((const int32_t *)filter_x0)[0]; 803 filter34 = ((const int32_t *)filter_x0)[1]; 804 filter56 = ((const int32_t *)filter_x0)[2]; 805 filter78 = ((const int32_t *)filter_x0)[3]; 806 807 for (y = h; y--;) { 808 /* prefetch data to cache memory */ 809 prefetch_load(src_ptr + src_stride); 810 prefetch_load(src_ptr + src_stride + 32); 811 prefetch_load(src_ptr + src_stride + 64); 812 813 src = src_ptr; 814 dst = dst_ptr; 815 816 odd_dst = (dst + dst_stride); 817 818 for (c = 0; c < 4; c++) { 819 __asm__ __volatile__( 820 "ulw %[qload1], 0(%[src]) " 821 "\n\t" 822 "ulw %[qload2], 4(%[src]) " 823 "\n\t" 824 825 /* even 1. 
pixel */ 826 "mtlo %[vector_64], $ac1 " 827 "\n\t" /* even 1 */ 828 "mthi $zero, $ac1 " 829 "\n\t" 830 "mtlo %[vector_64], $ac2 " 831 "\n\t" /* even 2 */ 832 "mthi $zero, $ac2 " 833 "\n\t" 834 "preceu.ph.qbr %[p3], %[qload2] " 835 "\n\t" 836 "preceu.ph.qbl %[p4], %[qload2] " 837 "\n\t" 838 "preceu.ph.qbr %[p1], %[qload1] " 839 "\n\t" 840 "preceu.ph.qbl %[p2], %[qload1] " 841 "\n\t" 842 "ulw %[qload2], 8(%[src]) " 843 "\n\t" 844 "dpa.w.ph $ac1, %[p1], %[filter12] " 845 "\n\t" /* even 1 */ 846 "dpa.w.ph $ac1, %[p2], %[filter34] " 847 "\n\t" /* even 1 */ 848 "dpa.w.ph $ac1, %[p3], %[filter56] " 849 "\n\t" /* even 1 */ 850 "dpa.w.ph $ac1, %[p4], %[filter78] " 851 "\n\t" /* even 1 */ 852 "extp %[Temp1], $ac1, 31 " 853 "\n\t" /* even 1 */ 854 855 /* even 2. pixel */ 856 "mtlo %[vector_64], $ac3 " 857 "\n\t" /* even 3 */ 858 "mthi $zero, $ac3 " 859 "\n\t" 860 "preceu.ph.qbr %[p1], %[qload2] " 861 "\n\t" 862 "preceu.ph.qbl %[p5], %[qload2] " 863 "\n\t" 864 "ulw %[qload1], 12(%[src]) " 865 "\n\t" 866 "dpa.w.ph $ac2, %[p2], %[filter12] " 867 "\n\t" /* even 1 */ 868 "dpa.w.ph $ac2, %[p3], %[filter34] " 869 "\n\t" /* even 1 */ 870 "dpa.w.ph $ac2, %[p4], %[filter56] " 871 "\n\t" /* even 1 */ 872 "dpa.w.ph $ac2, %[p1], %[filter78] " 873 "\n\t" /* even 1 */ 874 "lbux %[st1], %[Temp1](%[cm]) " 875 "\n\t" /* even 1 */ 876 "extp %[Temp2], $ac2, 31 " 877 "\n\t" /* even 1 */ 878 879 /* even 3. 
pixel */ 880 "mtlo %[vector_64], $ac1 " 881 "\n\t" /* even 4 */ 882 "mthi $zero, $ac1 " 883 "\n\t" 884 "preceu.ph.qbr %[p2], %[qload1] " 885 "\n\t" 886 "sb %[st1], 0(%[dst]) " 887 "\n\t" /* even 1 */ 888 "addu %[dst], %[dst], %[dst_pitch_2] " 889 " \n\t" 890 "dpa.w.ph $ac3, %[p3], %[filter12] " 891 "\n\t" /* even 3 */ 892 "dpa.w.ph $ac3, %[p4], %[filter34] " 893 "\n\t" /* even 3 */ 894 "dpa.w.ph $ac3, %[p1], %[filter56] " 895 "\n\t" /* even 3 */ 896 "dpa.w.ph $ac3, %[p5], %[filter78] " 897 "\n\t" /* even 3 */ 898 "extp %[Temp3], $ac3, 31 " 899 "\n\t" /* even 3 */ 900 "lbux %[st2], %[Temp2](%[cm]) " 901 "\n\t" /* even 1 */ 902 903 /* even 4. pixel */ 904 "mtlo %[vector_64], $ac2 " 905 "\n\t" /* even 5 */ 906 "mthi $zero, $ac2 " 907 "\n\t" 908 "preceu.ph.qbl %[p3], %[qload1] " 909 "\n\t" 910 "sb %[st2], 0(%[dst]) " 911 "\n\t" /* even 2 */ 912 "addu %[dst], %[dst], %[dst_pitch_2] " 913 "\n\t" 914 "ulw %[qload2], 16(%[src]) " 915 "\n\t" 916 "dpa.w.ph $ac1, %[p4], %[filter12] " 917 "\n\t" /* even 4 */ 918 "dpa.w.ph $ac1, %[p1], %[filter34] " 919 "\n\t" /* even 4 */ 920 "dpa.w.ph $ac1, %[p5], %[filter56] " 921 "\n\t" /* even 4 */ 922 "dpa.w.ph $ac1, %[p2], %[filter78] " 923 "\n\t" /* even 4 */ 924 "extp %[Temp1], $ac1, 31 " 925 "\n\t" /* even 4 */ 926 "lbux %[st3], %[Temp3](%[cm]) " 927 "\n\t" /* even 3 */ 928 929 /* even 5. pixel */ 930 "mtlo %[vector_64], $ac3 " 931 "\n\t" /* even 6 */ 932 "mthi $zero, $ac3 " 933 "\n\t" 934 "preceu.ph.qbr %[p4], %[qload2] " 935 "\n\t" 936 "sb %[st3], 0(%[dst]) " 937 "\n\t" /* even 3 */ 938 "addu %[dst], %[dst], %[dst_pitch_2] " 939 "\n\t" 940 "dpa.w.ph $ac2, %[p1], %[filter12] " 941 "\n\t" /* even 5 */ 942 "dpa.w.ph $ac2, %[p5], %[filter34] " 943 "\n\t" /* even 5 */ 944 "dpa.w.ph $ac2, %[p2], %[filter56] " 945 "\n\t" /* even 5 */ 946 "dpa.w.ph $ac2, %[p3], %[filter78] " 947 "\n\t" /* even 5 */ 948 "extp %[Temp2], $ac2, 31 " 949 "\n\t" /* even 5 */ 950 "lbux %[st1], %[Temp1](%[cm]) " 951 "\n\t" /* even 4 */ 952 953 /* even 6. 
pixel */ 954 "mtlo %[vector_64], $ac1 " 955 "\n\t" /* even 7 */ 956 "mthi $zero, $ac1 " 957 "\n\t" 958 "preceu.ph.qbl %[p1], %[qload2] " 959 "\n\t" 960 "sb %[st1], 0(%[dst]) " 961 "\n\t" /* even 4 */ 962 "addu %[dst], %[dst], %[dst_pitch_2] " 963 "\n\t" 964 "ulw %[qload1], 20(%[src]) " 965 "\n\t" 966 "dpa.w.ph $ac3, %[p5], %[filter12] " 967 "\n\t" /* even 6 */ 968 "dpa.w.ph $ac3, %[p2], %[filter34] " 969 "\n\t" /* even 6 */ 970 "dpa.w.ph $ac3, %[p3], %[filter56] " 971 "\n\t" /* even 6 */ 972 "dpa.w.ph $ac3, %[p4], %[filter78] " 973 "\n\t" /* even 6 */ 974 "extp %[Temp3], $ac3, 31 " 975 "\n\t" /* even 6 */ 976 "lbux %[st2], %[Temp2](%[cm]) " 977 "\n\t" /* even 5 */ 978 979 /* even 7. pixel */ 980 "mtlo %[vector_64], $ac2 " 981 "\n\t" /* even 8 */ 982 "mthi $zero, $ac2 " 983 "\n\t" 984 "preceu.ph.qbr %[p5], %[qload1] " 985 "\n\t" 986 "sb %[st2], 0(%[dst]) " 987 "\n\t" /* even 5 */ 988 "addu %[dst], %[dst], %[dst_pitch_2] " 989 "\n\t" 990 "dpa.w.ph $ac1, %[p2], %[filter12] " 991 "\n\t" /* even 7 */ 992 "dpa.w.ph $ac1, %[p3], %[filter34] " 993 "\n\t" /* even 7 */ 994 "dpa.w.ph $ac1, %[p4], %[filter56] " 995 "\n\t" /* even 7 */ 996 "dpa.w.ph $ac1, %[p1], %[filter78] " 997 "\n\t" /* even 7 */ 998 "extp %[Temp1], $ac1, 31 " 999 "\n\t" /* even 7 */ 1000 "lbux %[st3], %[Temp3](%[cm]) " 1001 "\n\t" /* even 6 */ 1002 1003 /* even 8. 
pixel */ 1004 "mtlo %[vector_64], $ac3 " 1005 "\n\t" /* odd 1 */ 1006 "mthi $zero, $ac3 " 1007 "\n\t" 1008 "dpa.w.ph $ac2, %[p3], %[filter12] " 1009 "\n\t" /* even 8 */ 1010 "dpa.w.ph $ac2, %[p4], %[filter34] " 1011 "\n\t" /* even 8 */ 1012 "sb %[st3], 0(%[dst]) " 1013 "\n\t" /* even 6 */ 1014 "addu %[dst], %[dst], %[dst_pitch_2] " 1015 "\n\t" 1016 "dpa.w.ph $ac2, %[p1], %[filter56] " 1017 "\n\t" /* even 8 */ 1018 "dpa.w.ph $ac2, %[p5], %[filter78] " 1019 "\n\t" /* even 8 */ 1020 "extp %[Temp2], $ac2, 31 " 1021 "\n\t" /* even 8 */ 1022 "lbux %[st1], %[Temp1](%[cm]) " 1023 "\n\t" /* even 7 */ 1024 1025 /* ODD pixels */ 1026 "ulw %[qload1], 1(%[src]) " 1027 "\n\t" 1028 "ulw %[qload2], 5(%[src]) " 1029 "\n\t" 1030 1031 /* odd 1. pixel */ 1032 "mtlo %[vector_64], $ac1 " 1033 "\n\t" /* odd 2 */ 1034 "mthi $zero, $ac1 " 1035 "\n\t" 1036 "preceu.ph.qbr %[p1], %[qload1] " 1037 "\n\t" 1038 "preceu.ph.qbl %[p2], %[qload1] " 1039 "\n\t" 1040 "preceu.ph.qbr %[p3], %[qload2] " 1041 "\n\t" 1042 "preceu.ph.qbl %[p4], %[qload2] " 1043 "\n\t" 1044 "sb %[st1], 0(%[dst]) " 1045 "\n\t" /* even 7 */ 1046 "addu %[dst], %[dst], %[dst_pitch_2] " 1047 "\n\t" 1048 "ulw %[qload2], 9(%[src]) " 1049 "\n\t" 1050 "dpa.w.ph $ac3, %[p1], %[filter12] " 1051 "\n\t" /* odd 1 */ 1052 "dpa.w.ph $ac3, %[p2], %[filter34] " 1053 "\n\t" /* odd 1 */ 1054 "dpa.w.ph $ac3, %[p3], %[filter56] " 1055 "\n\t" /* odd 1 */ 1056 "dpa.w.ph $ac3, %[p4], %[filter78] " 1057 "\n\t" /* odd 1 */ 1058 "extp %[Temp3], $ac3, 31 " 1059 "\n\t" /* odd 1 */ 1060 "lbux %[st2], %[Temp2](%[cm]) " 1061 "\n\t" /* even 8 */ 1062 1063 /* odd 2. 
pixel */ 1064 "mtlo %[vector_64], $ac2 " 1065 "\n\t" /* odd 3 */ 1066 "mthi $zero, $ac2 " 1067 "\n\t" 1068 "preceu.ph.qbr %[p1], %[qload2] " 1069 "\n\t" 1070 "preceu.ph.qbl %[p5], %[qload2] " 1071 "\n\t" 1072 "sb %[st2], 0(%[dst]) " 1073 "\n\t" /* even 8 */ 1074 "ulw %[qload1], 13(%[src]) " 1075 "\n\t" 1076 "dpa.w.ph $ac1, %[p2], %[filter12] " 1077 "\n\t" /* odd 2 */ 1078 "dpa.w.ph $ac1, %[p3], %[filter34] " 1079 "\n\t" /* odd 2 */ 1080 "dpa.w.ph $ac1, %[p4], %[filter56] " 1081 "\n\t" /* odd 2 */ 1082 "dpa.w.ph $ac1, %[p1], %[filter78] " 1083 "\n\t" /* odd 2 */ 1084 "extp %[Temp1], $ac1, 31 " 1085 "\n\t" /* odd 2 */ 1086 "lbux %[st3], %[Temp3](%[cm]) " 1087 "\n\t" /* odd 1 */ 1088 1089 /* odd 3. pixel */ 1090 "mtlo %[vector_64], $ac3 " 1091 "\n\t" /* odd 4 */ 1092 "mthi $zero, $ac3 " 1093 "\n\t" 1094 "preceu.ph.qbr %[p2], %[qload1] " 1095 "\n\t" 1096 "sb %[st3], 0(%[odd_dst]) " 1097 "\n\t" /* odd 1 */ 1098 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 1099 "\n\t" 1100 "dpa.w.ph $ac2, %[p3], %[filter12] " 1101 "\n\t" /* odd 3 */ 1102 "dpa.w.ph $ac2, %[p4], %[filter34] " 1103 "\n\t" /* odd 3 */ 1104 "dpa.w.ph $ac2, %[p1], %[filter56] " 1105 "\n\t" /* odd 3 */ 1106 "dpa.w.ph $ac2, %[p5], %[filter78] " 1107 "\n\t" /* odd 3 */ 1108 "extp %[Temp2], $ac2, 31 " 1109 "\n\t" /* odd 3 */ 1110 "lbux %[st1], %[Temp1](%[cm]) " 1111 "\n\t" /* odd 2 */ 1112 1113 /* odd 4. 
pixel */ 1114 "mtlo %[vector_64], $ac1 " 1115 "\n\t" /* odd 5 */ 1116 "mthi $zero, $ac1 " 1117 "\n\t" 1118 "preceu.ph.qbl %[p3], %[qload1] " 1119 "\n\t" 1120 "sb %[st1], 0(%[odd_dst]) " 1121 "\n\t" /* odd 2 */ 1122 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 1123 "\n\t" 1124 "ulw %[qload2], 17(%[src]) " 1125 "\n\t" 1126 "dpa.w.ph $ac3, %[p4], %[filter12] " 1127 "\n\t" /* odd 4 */ 1128 "dpa.w.ph $ac3, %[p1], %[filter34] " 1129 "\n\t" /* odd 4 */ 1130 "dpa.w.ph $ac3, %[p5], %[filter56] " 1131 "\n\t" /* odd 4 */ 1132 "dpa.w.ph $ac3, %[p2], %[filter78] " 1133 "\n\t" /* odd 4 */ 1134 "extp %[Temp3], $ac3, 31 " 1135 "\n\t" /* odd 4 */ 1136 "lbux %[st2], %[Temp2](%[cm]) " 1137 "\n\t" /* odd 3 */ 1138 1139 /* odd 5. pixel */ 1140 "mtlo %[vector_64], $ac2 " 1141 "\n\t" /* odd 6 */ 1142 "mthi $zero, $ac2 " 1143 "\n\t" 1144 "preceu.ph.qbr %[p4], %[qload2] " 1145 "\n\t" 1146 "sb %[st2], 0(%[odd_dst]) " 1147 "\n\t" /* odd 3 */ 1148 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 1149 "\n\t" 1150 "dpa.w.ph $ac1, %[p1], %[filter12] " 1151 "\n\t" /* odd 5 */ 1152 "dpa.w.ph $ac1, %[p5], %[filter34] " 1153 "\n\t" /* odd 5 */ 1154 "dpa.w.ph $ac1, %[p2], %[filter56] " 1155 "\n\t" /* odd 5 */ 1156 "dpa.w.ph $ac1, %[p3], %[filter78] " 1157 "\n\t" /* odd 5 */ 1158 "extp %[Temp1], $ac1, 31 " 1159 "\n\t" /* odd 5 */ 1160 "lbux %[st3], %[Temp3](%[cm]) " 1161 "\n\t" /* odd 4 */ 1162 1163 /* odd 6. 
pixel */ 1164 "mtlo %[vector_64], $ac3 " 1165 "\n\t" /* odd 7 */ 1166 "mthi $zero, $ac3 " 1167 "\n\t" 1168 "preceu.ph.qbl %[p1], %[qload2] " 1169 "\n\t" 1170 "sb %[st3], 0(%[odd_dst]) " 1171 "\n\t" /* odd 4 */ 1172 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 1173 "\n\t" 1174 "ulw %[qload1], 21(%[src]) " 1175 "\n\t" 1176 "dpa.w.ph $ac2, %[p5], %[filter12] " 1177 "\n\t" /* odd 6 */ 1178 "dpa.w.ph $ac2, %[p2], %[filter34] " 1179 "\n\t" /* odd 6 */ 1180 "dpa.w.ph $ac2, %[p3], %[filter56] " 1181 "\n\t" /* odd 6 */ 1182 "dpa.w.ph $ac2, %[p4], %[filter78] " 1183 "\n\t" /* odd 6 */ 1184 "extp %[Temp2], $ac2, 31 " 1185 "\n\t" /* odd 6 */ 1186 "lbux %[st1], %[Temp1](%[cm]) " 1187 "\n\t" /* odd 5 */ 1188 1189 /* odd 7. pixel */ 1190 "mtlo %[vector_64], $ac1 " 1191 "\n\t" /* odd 8 */ 1192 "mthi $zero, $ac1 " 1193 "\n\t" 1194 "preceu.ph.qbr %[p5], %[qload1] " 1195 "\n\t" 1196 "sb %[st1], 0(%[odd_dst]) " 1197 "\n\t" /* odd 5 */ 1198 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 1199 "\n\t" 1200 "dpa.w.ph $ac3, %[p2], %[filter12] " 1201 "\n\t" /* odd 7 */ 1202 "dpa.w.ph $ac3, %[p3], %[filter34] " 1203 "\n\t" /* odd 7 */ 1204 "dpa.w.ph $ac3, %[p4], %[filter56] " 1205 "\n\t" /* odd 7 */ 1206 "dpa.w.ph $ac3, %[p1], %[filter78] " 1207 "\n\t" /* odd 7 */ 1208 "extp %[Temp3], $ac3, 31 " 1209 "\n\t" /* odd 7 */ 1210 1211 /* odd 8. 
pixel */ 1212 "dpa.w.ph $ac1, %[p3], %[filter12] " 1213 "\n\t" /* odd 8 */ 1214 "dpa.w.ph $ac1, %[p4], %[filter34] " 1215 "\n\t" /* odd 8 */ 1216 "dpa.w.ph $ac1, %[p1], %[filter56] " 1217 "\n\t" /* odd 8 */ 1218 "dpa.w.ph $ac1, %[p5], %[filter78] " 1219 "\n\t" /* odd 8 */ 1220 "extp %[Temp1], $ac1, 31 " 1221 "\n\t" /* odd 8 */ 1222 1223 "lbux %[st2], %[Temp2](%[cm]) " 1224 "\n\t" /* odd 6 */ 1225 "lbux %[st3], %[Temp3](%[cm]) " 1226 "\n\t" /* odd 7 */ 1227 "lbux %[st1], %[Temp1](%[cm]) " 1228 "\n\t" /* odd 8 */ 1229 1230 "sb %[st2], 0(%[odd_dst]) " 1231 "\n\t" /* odd 6 */ 1232 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 1233 "\n\t" 1234 1235 "sb %[st3], 0(%[odd_dst]) " 1236 "\n\t" /* odd 7 */ 1237 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] " 1238 "\n\t" 1239 1240 "sb %[st1], 0(%[odd_dst]) " 1241 "\n\t" /* odd 8 */ 1242 1243 : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5), 1244 [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3), 1245 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), 1246 [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), 1247 [dst] "+r"(dst), [odd_dst] "+r"(odd_dst) 1248 : [filter12] "r"(filter12), [filter34] "r"(filter34), 1249 [filter56] "r"(filter56), [filter78] "r"(filter78), 1250 [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src), 1251 [dst_pitch_2] "r"(dst_pitch_2)); 1252 1253 src += 16; 1254 dst = (dst_ptr + ((c + 1) * 16 * dst_stride)); 1255 odd_dst = (dst + dst_stride); 1256 } 1257 1258 /* Next row... 
*/
    src_ptr += src_stride;

    dst_ptr += 1;
  }
}

/* Scalar reference 8-tap horizontal convolution that stores its output
 * transposed: pixel x of an input row is written to dst[x * dst_stride]
 * and dst advances by one byte per input row, so source rows become
 * destination columns.  Fallback for widths the DSPR2 kernels above do
 * not handle.
 *
 * src        - first pixel of the first tap window for each row.
 * src_stride - byte stride between input rows.
 * dst        - transposed output block.
 * dst_stride - byte stride between output columns.
 * filter     - 8 taps in Q(FILTER_BITS) fixed point.
 * w, h       - block width and height in pixels.
 */
void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter, int w, int h) {
  int x, y, k;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      int sum = 0;

      /* 8-tap dot product over the horizontal window starting at x. */
      for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];

      /* Round away the filter precision and clamp to 8 bits;
       * transposed store (see function header). */
      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
    }

    src += src_stride;
    dst += 1;
  }
}

/* Plain w x h byte copy using the same transposed store pattern as
 * convolve_horiz_transposed.  Used when the filter reduces to a single
 * center tap of 0x80 (unit gain), i.e. no filtering is required.
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int x, y;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      dst[x * dst_stride] = src[x];
    }

    src += src_stride;
    dst += 1;
  }
}

/* 8-tap separable convolution for DSPR2.  Both passes reuse the horizontal
 * kernels: the first pass filters src rows into `temp`, stored transposed,
 * and the second pass filters rows of `temp` (i.e. image columns) back into
 * dst, transposing again so the result lands in normal orientation.  Only
 * unit pixel steps (x_step_q4 == y_step_q4 == 16, no scaling) are supported,
 * as asserted below.
 */
void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  const int16_t *const filter_y = filter[y0_q4];
  /* Transposed intermediate buffer: widest block (64) by the largest
   * intermediate height this routine computes (135). */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  /* Vertically filtering h output rows with an 8-tap kernel consumes
   * h + 7 intermediate rows. */
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
  uint32_t pos = 38;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* NOTE(review): the DSPR2 kernels read the taps as packed int16 pairs
   * through int32 loads; these asserts appear to reject a tap layout the
   * packed reads cannot handle -- confirm against the kernel sources. */
  assert(((const int32_t *)filter_x)[1] != 0x800000);
  assert(((const int32_t *)filter_y)[1] != 0x800000);
  (void)x_step_q4;

  /* bit position for extract from acc: programs the DSP control `pos`
   * field consumed by the `extp` instructions inside the kernels */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  if (intermediate_height < h) intermediate_height = h;

  /* First (horizontal) pass: src rows -> temp, transposed. */
  if (filter_x[3] == 0x80) {
    /* Unit-gain center tap: no filtering, straight transposed copy. */
    copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
                          intermediate_height, w, intermediate_height);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* Leading tap pair is zero: dispatch to the cheaper 2-tap kernel. */
    vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
                        intermediate_height, filter_x, w, intermediate_height);
  } else {
    /* Rewind 3 rows and 3 columns so the 8-tap window is centered. */
    src -= (src_stride * 3 + 3);

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);

    switch (w) {
      case 4:
        convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
                                          intermediate_height, filter_x,
                                          intermediate_height);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
                                          intermediate_height, filter_x,
                                          intermediate_height);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
                                           intermediate_height, filter_x,
                                           intermediate_height, (w / 16));
        break;
      case 64:
        prefetch_load(src + 32);
        convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
                                           intermediate_height, filter_x,
                                           intermediate_height);
        break;
      default:
        /* Non-power-of-two or unexpected width: scalar fallback. */
        convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
                                  filter_x, w, intermediate_height);
        break;
    }
  }

  /* Second (vertical) pass: temp rows (image columns) -> dst, transposed
   * back to normal orientation.  Note w/h swap in the calls below. */
  if (filter_y[3] == 0x80) {
    copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
                        filter_y, h, w);
  } else {
    switch (h) {
      case 4:
        convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
                                          dst_stride, filter_y, w);
        break;
      case 8:
        convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
                                          dst_stride, filter_y, w);
        break;
      case 16:
      case 32:
        convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
                                           dst_stride, filter_y, w, (h / 16));
        break;
      case 64:
        convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
                                           dst_stride, filter_y, w);
        break;
      default:
        convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
                                  filter_y, h, w);
        break;
    }
  }
}

/* Unfiltered block copy.  Widths 4/8/16/32/64 move whole rows with
 * unaligned word loads (ulw) paired with word stores (sw); any other
 * width falls back to a scalar byte loop.  The filter/step parameters
 * exist only to match the common convolve prototype and are ignored.
 */
void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  int x, y;
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage: the same eight registers are reused for the
       * second 32 bytes of each row */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    default:
      /* Arbitrary width: scalar byte copy. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif  // #if HAVE_DSPR2