/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
/* 2-tap (bilinear) horizontal convolution, 4 output pixels per row.
 * src/dst strides are in bytes, h is the number of rows.  filter_x0 points
 * at an 8-tap kernel of which only taps 3 and 4 are used here; they are
 * loaded together as one packed 32-bit word for dpa.w.ph. */
static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* byte clamp table, indexed via lbux */
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64; /* rounding constant preloaded into the acc */
  uint32_t tp1, tp2;
  uint32_t p1, p2;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  /* Pack taps 3 and 4 into one word.
   * NOTE(review): int16_t* read through an int32_t* — assumes the kernel
   * is 4-byte aligned and the aliasing is tolerated; verify build flags. */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p1],          %[Temp2](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"

        /* store bytes */
        "sb               %[tp1],         0(%[dst])                      \n\t"
        "sb               %[p1],          1(%[dst])                      \n\t"
        "sb               %[tp2],         2(%[dst])                      \n\t"
        "sb               %[p2],          3(%[dst])                      \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* 2-tap (bilinear) horizontal convolution, 8 output pixels per row.
 * Same contract as convolve_bi_horiz_4_dspr2 but processes an 8-wide row,
 * interleaving loads, multiplies and stores across the three DSP
 * accumulators to hide latency. */
static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_x0, int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* byte clamp table, indexed via lbux */
  uint32_t vector4a = 64;       /* rounding constant preloaded into the acc */
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3;
  uint32_t p1, p2, p3, p4;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  /* Taps 3 and 4 packed into one word (see note in the 4-wide variant). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "ulw              %[tp3],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac1,           31             \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "sb               %[st0],         0(%[dst])                      \n\t"
        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"

        "balign           %[tp3],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"

        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "sb               %[st1],         2(%[dst])                      \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
        "sb               %[st0],         4(%[dst])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac1,           31             \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 4. pixel */
        "sb               %[st1],         1(%[dst])                      \n\t"
        "sb               %[st0],         6(%[dst])                      \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"

        /* store bytes */
        "sb               %[p4],          3(%[dst])                      \n\t"
        "sb               %[p2],          5(%[dst])                      \n\t"
        "sb               %[p1],          7(%[dst])                      \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
          [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
          [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* 2-tap (bilinear) horizontal convolution for rows that are a multiple of
 * 16 pixels wide: each inner-loop iteration emits 16 bytes, and `count`
 * selects how many 16-pixel chunks make up a row (1 for w==16, 2 for
 * w==32). */
static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride, uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h,
                                       int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* byte clamp table, indexed via lbux */
  uint32_t vector_64 = 64;      /* rounding constant preloaded into the acc */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  /* Taps 3 and 4 packed into one word (see note in the 4-wide variant). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],      0(%[src])                  \n\t"
          "ulw              %[qload2],      4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* even 1 */
          "mthi             $zero,          $ac1                       \n\t"
          "mtlo             %[vector_64],   $ac2                       \n\t" /* even 2 */
          "mthi             $zero,          $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                  \n\t"
          "ulw              %[qload3],      8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,           %[p1],        %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* even 3 */
          "mthi             $zero,          $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                  \n\t"
          "ulw              %[qload1],      12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,           %[p2],        %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* even 1 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* even 4 */
          "mthi             $zero,          $ac1                       \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                  \n\t"
          "sb               %[st1],         0(%[dst])                  \n\t" /* even 1 */
          "dpa.w.ph         $ac3,           %[p3],        %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                       \n\t" /* even 5 */
          "mthi             $zero,          $ac2                       \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                  \n\t"
          "sb               %[st2],         2(%[dst])                  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p4],        %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* even 6 */
          "mthi             $zero,          $ac3                       \n\t"
          "sb               %[st3],         4(%[dst])                  \n\t" /* even 3 */
          "dpa.w.ph         $ac2,           %[p1],        %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* even 7 */
          "mthi             $zero,          $ac1                       \n\t"
          "sb               %[st1],         6(%[dst])                  \n\t" /* even 4 */
          "dpa.w.ph         $ac3,           %[p5],        %[filter45]  \n\t" /* even 6 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                       \n\t" /* even 8 */
          "mthi             $zero,          $ac2                       \n\t"
          "sb               %[st2],         8(%[dst])                  \n\t" /* even 5 */
          "dpa.w.ph         $ac1,           %[p2],        %[filter45]  \n\t" /* even 7 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                       \n\t"
          "dpa.w.ph         $ac2,           %[p3],        %[filter45]  \n\t" /* even 8 */
          "sb               %[st3],         10(%[dst])                 \n\t" /* even 6 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                  \n\t"
          "ulw              %[qload2],      5(%[src])                  \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                  \n\t"
          "sb               %[st1],         12(%[dst])                 \n\t" /* even 7 */
          "ulw              %[qload3],      9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,           %[p1],        %[filter45]  \n\t" /* odd 1 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                       \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                  \n\t"
          "sb               %[st2],         14(%[dst])                 \n\t" /* even 8 */
          "ulw              %[qload1],      13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,           %[p2],        %[filter45]  \n\t" /* odd 2 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                       \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                  \n\t"
          "sb               %[st3],         1(%[dst])                  \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,           %[p3],        %[filter45]  \n\t" /* odd 3 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                       \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                  \n\t"
          "sb               %[st1],         3(%[dst])                  \n\t" /* odd 2 */
          "dpa.w.ph         $ac3,           %[p4],        %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                       \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                       \n\t"
          "sb               %[st2],         5(%[dst])                  \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,           %[p1],        %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                       \n\t"
          "sb               %[st3],         7(%[dst])                  \n\t" /* odd 4 */
          "dpa.w.ph         $ac2,           %[p5],        %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                       \n\t"
          "sb               %[st1],         9(%[dst])                  \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,           %[p2],        %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],        %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* odd 8 */

          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* odd 6 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* odd 7 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* odd 8 */

          "sb               %[st2],         11(%[dst])                 \n\t" /* odd 6 */
          "sb               %[st3],         13(%[dst])                 \n\t" /* odd 7 */
          "sb               %[st1],         15(%[dst])                 \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* 2-tap (bilinear) horizontal convolution for 64-pixel-wide rows: four
 * fixed 16-pixel chunks per row, with an extra level of prefetching.  The
 * per-chunk assembly is the same schedule as convolve_bi_horiz_16_dspr2. */
static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride, uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0, int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* byte clamp table, indexed via lbux */
  uint32_t vector_64 = 64;      /* rounding constant preloaded into the acc */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  /* Taps 3 and 4 packed into one word (see note in the 4-wide variant). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw              %[qload1],      0(%[src])                  \n\t"
          "ulw              %[qload2],      4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* even 1 */
          "mthi             $zero,          $ac1                       \n\t"
          "mtlo             %[vector_64],   $ac2                       \n\t" /* even 2 */
          "mthi             $zero,          $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                  \n\t"
          "ulw              %[qload3],      8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,           %[p1],        %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* even 3 */
          "mthi             $zero,          $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                  \n\t"
          "ulw              %[qload1],      12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,           %[p2],        %[filter45]  \n\t" /* even 1 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* even 1 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* even 4 */
          "mthi             $zero,          $ac1                       \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                  \n\t"
          "sb               %[st1],         0(%[dst])                  \n\t" /* even 1 */
          "dpa.w.ph         $ac3,           %[p3],        %[filter45]  \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                       \n\t" /* even 5 */
          "mthi             $zero,          $ac2                       \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                  \n\t"
          "sb               %[st2],         2(%[dst])                  \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p4],        %[filter45]  \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* even 6 */
          "mthi             $zero,          $ac3                       \n\t"
          "sb               %[st3],         4(%[dst])                  \n\t" /* even 3 */
          "dpa.w.ph         $ac2,           %[p1],        %[filter45]  \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* even 7 */
          "mthi             $zero,          $ac1                       \n\t"
          "sb               %[st1],         6(%[dst])                  \n\t" /* even 4 */
          "dpa.w.ph         $ac3,           %[p5],        %[filter45]  \n\t" /* even 6 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                       \n\t" /* even 8 */
          "mthi             $zero,          $ac2                       \n\t"
          "sb               %[st2],         8(%[dst])                  \n\t" /* even 5 */
          "dpa.w.ph         $ac1,           %[p2],        %[filter45]  \n\t" /* even 7 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                       \n\t"
          "dpa.w.ph         $ac2,           %[p3],        %[filter45]  \n\t" /* even 8 */
          "sb               %[st3],         10(%[dst])                 \n\t" /* even 6 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                  \n\t"
          "ulw              %[qload2],      5(%[src])                  \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                  \n\t"
          "sb               %[st1],         12(%[dst])                 \n\t" /* even 7 */
          "ulw              %[qload3],      9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,           %[p1],        %[filter45]  \n\t" /* odd 1 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                       \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                  \n\t"
          "sb               %[st2],         14(%[dst])                 \n\t" /* even 8 */
          "ulw              %[qload1],      13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,           %[p2],        %[filter45]  \n\t" /* odd 2 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                       \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                  \n\t"
          "sb               %[st3],         1(%[dst])                  \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,           %[p3],        %[filter45]  \n\t" /* odd 3 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                       \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                  \n\t"
          "sb               %[st1],         3(%[dst])                  \n\t" /* odd 2 */
          "dpa.w.ph         $ac3,           %[p4],        %[filter45]  \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                       \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                       \n\t"
          "sb               %[st2],         5(%[dst])                  \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,           %[p1],        %[filter45]  \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                       \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                       \n\t"
          "sb               %[st3],         7(%[dst])                  \n\t" /* odd 4 */
          "dpa.w.ph         $ac2,           %[p5],        %[filter45]  \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,         31           \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                       \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                       \n\t"
          "sb               %[st1],         9(%[dst])                  \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,           %[p2],        %[filter45]  \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,         31           \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],        %[filter45]  \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,         31           \n\t" /* odd 8 */

          "lbux             %[st2],         %[Temp2](%[cm])            \n\t" /* odd 6 */
          "lbux             %[st3],         %[Temp3](%[cm])            \n\t" /* odd 7 */
          "lbux             %[st1],         %[Temp1](%[cm])            \n\t" /* odd 8 */

          "sb               %[st2],         11(%[dst])                 \n\t" /* odd 6 */
          "sb               %[st3],         13(%[dst])                 \n\t" /* odd 7 */
          "sb               %[st1],         15(%[dst])                 \n\t" /* odd 8 */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* Public entry point: dispatches a 2-tap horizontal convolve to the
 * width-specialized DSPr2 kernels above (w = 4/8/16/32/64); any other
 * width falls back to the generic C 8-tap path.  Only unit horizontal
 * steps are supported (x_step_q4 == 16).  y0_q4/y_step_q4 are unused here
 * except for the fallback call. */
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *filter, int x0_q4,
                               int32_t x_step_q4, int y0_q4, int y_step_q4,
                               int w, int h) {
  const int16_t *const filter_x = filter[x0_q4];
  uint32_t pos = 38;

  assert(x_step_q4 == 16);

  prefetch_load((const uint8_t *)filter_x);

  /* bit position for extract from acc (consumed by extp in the kernels) */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h);
      break;
    case 8:
      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
                                (int32_t)dst_stride, filter_x, (int32_t)h);
      break;
    case 16:
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filter_x, (int32_t)h, 1);
      break;
    case 32:
      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filter_x, (int32_t)h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
                                 (int32_t)dst_stride, filter_x, (int32_t)h);
      break;
    default:
      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                            x_step_q4, y0_q4, y_step_q4, w, h);
      break;
  }
}
#endif