1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <stdio.h> 13 14#include "./vpx_dsp_rtcd.h" 15#include "vpx_dsp/mips/convolve_common_dspr2.h" 16#include "vpx_dsp/vpx_convolve.h" 17#include "vpx_dsp/vpx_dsp_common.h" 18#include "vpx_ports/mem.h" 19 20#if HAVE_DSPR2 21static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src, 22 int32_t src_stride, 23 uint8_t *dst, 24 int32_t dst_stride, 25 const int16_t *filter_x0, 26 int32_t h) { 27 int32_t y; 28 uint8_t *cm = vpx_ff_cropTbl; 29 int32_t Temp1, Temp2, Temp3, Temp4; 30 uint32_t vector4a = 64; 31 uint32_t tp1, tp2; 32 uint32_t p1, p2, p3; 33 uint32_t tn1, tn2; 34 const int16_t *filter = &filter_x0[3]; 35 uint32_t filter45; 36 37 filter45 = ((const int32_t *)filter)[0]; 38 39 for (y = h; y--;) { 40 /* prefetch data to cache memory */ 41 prefetch_load(src + src_stride); 42 prefetch_load(src + src_stride + 32); 43 prefetch_store(dst + dst_stride); 44 45 __asm__ __volatile__ ( 46 "ulw %[tp1], 0(%[src]) \n\t" 47 "ulw %[tp2], 4(%[src]) \n\t" 48 49 /* even 1. pixel */ 50 "mtlo %[vector4a], $ac3 \n\t" 51 "mthi $zero, $ac3 \n\t" 52 "preceu.ph.qbr %[p1], %[tp1] \n\t" 53 "preceu.ph.qbl %[p2], %[tp1] \n\t" 54 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 55 "extp %[Temp1], $ac3, 31 \n\t" 56 57 /* even 2. pixel */ 58 "mtlo %[vector4a], $ac2 \n\t" 59 "mthi $zero, $ac2 \n\t" 60 "balign %[tp2], %[tp1], 3 \n\t" 61 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" 62 "extp %[Temp3], $ac2, 31 \n\t" 63 64 "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */ 65 66 /* odd 1. pixel */ 67 "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */ 68 "mtlo %[vector4a], $ac3 \n\t" 69 "mthi $zero, $ac3 \n\t" 70 "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */ 71 "preceu.ph.qbr %[p1], %[tp2] \n\t" 72 "preceu.ph.qbl %[p3], %[tp2] \n\t" 73 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 74 "extp %[Temp2], $ac3, 31 \n\t" 75 76 "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */ 77 78 /* odd 2. pixel */ 79 "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */ 80 "mtlo %[vector4a], $ac2 \n\t" 81 "mthi $zero, $ac2 \n\t" 82 "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */ 83 "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */ 84 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" 85 "extp %[Temp4], $ac2, 31 \n\t" 86 87 "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */ 88 "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */ 89 90 /* clamp */ 91 "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */ 92 "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */ 93 "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */ 94 95 "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */ 96 "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */ 97 98 "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */ 99 "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */ 100 101 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 102 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), 103 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), 104 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 105 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4) 106 : [filter45] "r" (filter45), [vector4a] "r" (vector4a), 107 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) 108 ); 109 110 /* Next row... */ 111 src += src_stride; 112 dst += dst_stride; 113 } 114} 115 116static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src, 117 int32_t src_stride, 118 uint8_t *dst, 119 int32_t dst_stride, 120 const int16_t *filter_x0, 121 int32_t h) { 122 int32_t y; 123 uint8_t *cm = vpx_ff_cropTbl; 124 uint32_t vector4a = 64; 125 int32_t Temp1, Temp2, Temp3; 126 uint32_t tp1, tp2, tp3, tp4; 127 uint32_t p1, p2, p3, p4, n1; 128 uint32_t st0, st1; 129 const int16_t *filter = &filter_x0[3]; 130 uint32_t filter45;; 131 132 filter45 = ((const int32_t *)filter)[0]; 133 134 for (y = h; y--;) { 135 /* prefetch data to cache memory */ 136 prefetch_load(src + src_stride); 137 prefetch_load(src + src_stride + 32); 138 prefetch_store(dst + dst_stride); 139 140 __asm__ __volatile__ ( 141 "ulw %[tp1], 0(%[src]) \n\t" 142 "ulw %[tp2], 4(%[src]) \n\t" 143 144 /* even 1. pixel */ 145 "mtlo %[vector4a], $ac3 \n\t" 146 "mthi $zero, $ac3 \n\t" 147 "mtlo %[vector4a], $ac2 \n\t" 148 "mthi $zero, $ac2 \n\t" 149 "preceu.ph.qbr %[p1], %[tp1] \n\t" 150 "preceu.ph.qbl %[p2], %[tp1] \n\t" 151 "preceu.ph.qbr %[p3], %[tp2] \n\t" 152 "preceu.ph.qbl %[p4], %[tp2] \n\t" 153 "ulw %[tp3], 8(%[src]) \n\t" 154 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 155 "extp %[Temp1], $ac3, 31 \n\t" 156 "lbu %[Temp2], 0(%[dst]) \n\t" 157 "lbu %[tp4], 2(%[dst]) \n\t" 158 159 /* even 2. pixel */ 160 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" 161 "extp %[Temp3], $ac2, 31 \n\t" 162 163 /* even 3. pixel */ 164 "lbux %[st0], %[Temp1](%[cm]) \n\t" 165 "mtlo %[vector4a], $ac1 \n\t" 166 "mthi $zero, $ac1 \n\t" 167 "lbux %[st1], %[Temp3](%[cm]) \n\t" 168 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" 169 "extp %[Temp1], $ac1, 31 \n\t" 170 171 "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" 172 "addqh_r.w %[tp4], %[tp4], %[st1] \n\t" 173 "sb %[Temp2], 0(%[dst]) \n\t" 174 "sb %[tp4], 2(%[dst]) \n\t" 175 176 /* even 4. pixel */ 177 "mtlo %[vector4a], $ac2 \n\t" 178 "mthi $zero, $ac2 \n\t" 179 "mtlo %[vector4a], $ac3 \n\t" 180 "mthi $zero, $ac3 \n\t" 181 182 "balign %[tp3], %[tp2], 3 \n\t" 183 "balign %[tp2], %[tp1], 3 \n\t" 184 185 "lbux %[st0], %[Temp1](%[cm]) \n\t" 186 "lbu %[Temp2], 4(%[dst]) \n\t" 187 "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t" 188 189 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" 190 "extp %[Temp3], $ac2, 31 \n\t" 191 192 /* odd 1. pixel */ 193 "mtlo %[vector4a], $ac1 \n\t" 194 "mthi $zero, $ac1 \n\t" 195 "sb %[Temp2], 4(%[dst]) \n\t" 196 "preceu.ph.qbr %[p1], %[tp2] \n\t" 197 "preceu.ph.qbl %[p2], %[tp2] \n\t" 198 "preceu.ph.qbr %[p3], %[tp3] \n\t" 199 "preceu.ph.qbl %[p4], %[tp3] \n\t" 200 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" 201 "extp %[Temp2], $ac3, 31 \n\t" 202 203 "lbu %[tp1], 6(%[dst]) \n\t" 204 205 /* odd 2. pixel */ 206 "mtlo %[vector4a], $ac3 \n\t" 207 "mthi $zero, $ac3 \n\t" 208 "mtlo %[vector4a], $ac2 \n\t" 209 "mthi $zero, $ac2 \n\t" 210 "lbux %[st0], %[Temp3](%[cm]) \n\t" 211 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" 212 "extp %[Temp3], $ac1, 31 \n\t" 213 214 "lbu %[tp2], 1(%[dst]) \n\t" 215 "lbu %[tp3], 3(%[dst]) \n\t" 216 "addqh_r.w %[tp1], %[tp1], %[st0] \n\t" 217 218 /* odd 3. pixel */ 219 "lbux %[st1], %[Temp2](%[cm]) \n\t" 220 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" 221 "addqh_r.w %[tp2], %[tp2], %[st1] \n\t" 222 "extp %[Temp2], $ac3, 31 \n\t" 223 224 "lbu %[tp4], 5(%[dst]) \n\t" 225 226 /* odd 4. pixel */ 227 "sb %[tp2], 1(%[dst]) \n\t" 228 "sb %[tp1], 6(%[dst]) \n\t" 229 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t" 230 "extp %[Temp1], $ac2, 31 \n\t" 231 232 "lbu %[tp1], 7(%[dst]) \n\t" 233 234 /* clamp */ 235 "lbux %[p4], %[Temp3](%[cm]) \n\t" 236 "addqh_r.w %[tp3], %[tp3], %[p4] \n\t" 237 238 "lbux %[p2], %[Temp2](%[cm]) \n\t" 239 "addqh_r.w %[tp4], %[tp4], %[p2] \n\t" 240 241 "lbux %[p1], %[Temp1](%[cm]) \n\t" 242 "addqh_r.w %[tp1], %[tp1], %[p1] \n\t" 243 244 /* store bytes */ 245 "sb %[tp3], 3(%[dst]) \n\t" 246 "sb %[tp4], 5(%[dst]) \n\t" 247 "sb %[tp1], 7(%[dst]) \n\t" 248 249 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 250 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), 251 [st0] "=&r" (st0), [st1] "=&r" (st1), 252 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 253 [n1] "=&r" (n1), 254 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) 255 : [filter45] "r" (filter45), [vector4a] "r" (vector4a), 256 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) 257 ); 258 259 /* Next row... */ 260 src += src_stride; 261 dst += dst_stride; 262 } 263} 264 265static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr, 266 int32_t src_stride, 267 uint8_t *dst_ptr, 268 int32_t dst_stride, 269 const int16_t *filter_x0, 270 int32_t h, 271 int32_t count) { 272 int32_t y, c; 273 const uint8_t *src; 274 uint8_t *dst; 275 uint8_t *cm = vpx_ff_cropTbl; 276 uint32_t vector_64 = 64; 277 int32_t Temp1, Temp2, Temp3; 278 uint32_t qload1, qload2, qload3; 279 uint32_t p1, p2, p3, p4, p5; 280 uint32_t st1, st2, st3; 281 const int16_t *filter = &filter_x0[3]; 282 uint32_t filter45;; 283 284 filter45 = ((const int32_t *)filter)[0]; 285 286 for (y = h; y--;) { 287 src = src_ptr; 288 dst = dst_ptr; 289 290 /* prefetch data to cache memory */ 291 prefetch_load(src_ptr + src_stride); 292 prefetch_load(src_ptr + src_stride + 32); 293 prefetch_store(dst_ptr + dst_stride); 294 295 for (c = 0; c < count; c++) { 296 __asm__ __volatile__ ( 297 "ulw %[qload1], 0(%[src]) \n\t" 298 "ulw %[qload2], 4(%[src]) \n\t" 299 300 /* even 1. pixel */ 301 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 302 "mthi $zero, $ac1 \n\t" 303 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 304 "mthi $zero, $ac2 \n\t" 305 "preceu.ph.qbr %[p1], %[qload1] \n\t" 306 "preceu.ph.qbl %[p2], %[qload1] \n\t" 307 "preceu.ph.qbr %[p3], %[qload2] \n\t" 308 "preceu.ph.qbl %[p4], %[qload2] \n\t" 309 "ulw %[qload3], 8(%[src]) \n\t" 310 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ 311 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 312 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ 313 314 /* even 2. pixel */ 315 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 316 "mthi $zero, $ac3 \n\t" 317 "preceu.ph.qbr %[p1], %[qload3] \n\t" 318 "preceu.ph.qbl %[p5], %[qload3] \n\t" 319 "ulw %[qload1], 12(%[src]) \n\t" 320 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ 321 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 322 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 323 324 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ 325 326 /* even 3. pixel */ 327 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 328 "mthi $zero, $ac1 \n\t" 329 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ 330 "preceu.ph.qbr %[p2], %[qload1] \n\t" 331 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ 332 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ 333 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 334 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 335 336 /* even 4. pixel */ 337 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 338 "mthi $zero, $ac2 \n\t" 339 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ 340 "preceu.ph.qbl %[p3], %[qload1] \n\t" 341 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ 342 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ 343 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ 344 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ 345 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 346 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 347 348 /* even 5. pixel */ 349 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 350 "mthi $zero, $ac3 \n\t" 351 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ 352 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ 353 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ 354 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 355 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 356 357 /* even 6. pixel */ 358 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 359 "mthi $zero, $ac1 \n\t" 360 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ 361 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ 362 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ 363 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ 364 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 365 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 366 367 /* even 7. pixel */ 368 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 369 "mthi $zero, $ac2 \n\t" 370 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ 371 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ 372 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ 373 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ 374 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 375 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 376 377 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ 378 379 /* even 8. pixel */ 380 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 381 "mthi $zero, $ac3 \n\t" 382 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ 383 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ 384 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ 385 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 386 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 387 388 /* ODD pixels */ 389 "ulw %[qload1], 1(%[src]) \n\t" 390 "ulw %[qload2], 5(%[src]) \n\t" 391 392 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ 393 394 /* odd 1. pixel */ 395 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 396 "mthi $zero, $ac1 \n\t" 397 "preceu.ph.qbr %[p1], %[qload1] \n\t" 398 "preceu.ph.qbl %[p2], %[qload1] \n\t" 399 "preceu.ph.qbr %[p3], %[qload2] \n\t" 400 "preceu.ph.qbl %[p4], %[qload2] \n\t" 401 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ 402 "ulw %[qload3], 9(%[src]) \n\t" 403 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ 404 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ 405 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 406 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 407 408 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ 409 410 /* odd 2. pixel */ 411 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 412 "mthi $zero, $ac2 \n\t" 413 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ 414 "preceu.ph.qbr %[p1], %[qload3] \n\t" 415 "preceu.ph.qbl %[p5], %[qload3] \n\t" 416 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ 417 "ulw %[qload1], 13(%[src]) \n\t" 418 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ 419 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ 420 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 421 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 422 423 /* odd 3. pixel */ 424 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 425 "mthi $zero, $ac3 \n\t" 426 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ 427 "preceu.ph.qbr %[p2], %[qload1] \n\t" 428 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ 429 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ 430 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 431 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 432 433 /* odd 4. pixel */ 434 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 435 "mthi $zero, $ac1 \n\t" 436 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ 437 "preceu.ph.qbl %[p3], %[qload1] \n\t" 438 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ 439 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ 440 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ 441 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 442 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 443 444 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ 445 446 /* odd 5. pixel */ 447 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 448 "mthi $zero, $ac2 \n\t" 449 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ 450 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ 451 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ 452 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 453 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 454 455 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ 456 457 /* odd 6. pixel */ 458 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 459 "mthi $zero, $ac3 \n\t" 460 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ 461 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ 462 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ 463 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 464 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 465 466 /* odd 7. pixel */ 467 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 468 "mthi $zero, $ac1 \n\t" 469 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ 470 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ 471 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ 472 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ 473 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 474 475 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ 476 477 /* odd 8. pixel */ 478 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ 479 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 480 481 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ 482 483 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 484 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ 485 486 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 487 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ 488 489 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 490 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ 491 492 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ 493 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ 494 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ 495 496 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), 497 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), 498 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 499 [qload3] "=&r" (qload3), [p5] "=&r" (p5), 500 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) 501 : [filter45] "r" (filter45), [vector_64] "r" (vector_64), 502 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) 503 ); 504 505 src += 16; 506 dst += 16; 507 } 508 509 /* Next row... */ 510 src_ptr += src_stride; 511 dst_ptr += dst_stride; 512 } 513} 514 515static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, 516 int32_t src_stride, 517 uint8_t *dst_ptr, 518 int32_t dst_stride, 519 const int16_t *filter_x0, 520 int32_t h) { 521 int32_t y, c; 522 const uint8_t *src; 523 uint8_t *dst; 524 uint8_t *cm = vpx_ff_cropTbl; 525 uint32_t vector_64 = 64; 526 int32_t Temp1, Temp2, Temp3; 527 uint32_t qload1, qload2, qload3; 528 uint32_t p1, p2, p3, p4, p5; 529 uint32_t st1, st2, st3; 530 const int16_t *filter = &filter_x0[3]; 531 uint32_t filter45;; 532 533 filter45 = ((const int32_t *)filter)[0]; 534 535 for (y = h; y--;) { 536 src = src_ptr; 537 dst = dst_ptr; 538 539 /* prefetch data to cache memory */ 540 prefetch_load(src_ptr + src_stride); 541 prefetch_load(src_ptr + src_stride + 32); 542 prefetch_load(src_ptr + src_stride + 64); 543 prefetch_store(dst_ptr + dst_stride); 544 prefetch_store(dst_ptr + dst_stride + 32); 545 546 for (c = 0; c < 4; c++) { 547 __asm__ __volatile__ ( 548 "ulw %[qload1], 0(%[src]) \n\t" 549 "ulw %[qload2], 4(%[src]) \n\t" 550 551 /* even 1. pixel */ 552 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 553 "mthi $zero, $ac1 \n\t" 554 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 555 "mthi $zero, $ac2 \n\t" 556 "preceu.ph.qbr %[p1], %[qload1] \n\t" 557 "preceu.ph.qbl %[p2], %[qload1] \n\t" 558 "preceu.ph.qbr %[p3], %[qload2] \n\t" 559 "preceu.ph.qbl %[p4], %[qload2] \n\t" 560 "ulw %[qload3], 8(%[src]) \n\t" 561 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */ 562 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 563 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ 564 565 /* even 2. pixel */ 566 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 567 "mthi $zero, $ac3 \n\t" 568 "preceu.ph.qbr %[p1], %[qload3] \n\t" 569 "preceu.ph.qbl %[p5], %[qload3] \n\t" 570 "ulw %[qload1], 12(%[src]) \n\t" 571 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */ 572 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 573 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 574 575 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ 576 577 /* even 3. pixel */ 578 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 579 "mthi $zero, $ac1 \n\t" 580 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ 581 "preceu.ph.qbr %[p2], %[qload1] \n\t" 582 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ 583 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */ 584 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 585 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 586 587 /* even 4. pixel */ 588 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 589 "mthi $zero, $ac2 \n\t" 590 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ 591 "preceu.ph.qbl %[p3], %[qload1] \n\t" 592 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ 593 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ 594 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ 595 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */ 596 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 597 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 598 599 /* even 5. pixel */ 600 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 601 "mthi $zero, $ac3 \n\t" 602 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ 603 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ 604 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */ 605 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 606 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 607 608 /* even 6. pixel */ 609 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 610 "mthi $zero, $ac1 \n\t" 611 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ 612 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ 613 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */ 614 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ 615 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 616 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 617 618 /* even 7. pixel */ 619 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 620 "mthi $zero, $ac2 \n\t" 621 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ 622 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ 623 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */ 624 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ 625 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 626 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 627 628 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ 629 630 /* even 8. pixel */ 631 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 632 "mthi $zero, $ac3 \n\t" 633 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ 634 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */ 635 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ 636 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 637 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 638 639 /* ODD pixels */ 640 "ulw %[qload1], 1(%[src]) \n\t" 641 "ulw %[qload2], 5(%[src]) \n\t" 642 643 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ 644 645 /* odd 1. pixel */ 646 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 647 "mthi $zero, $ac1 \n\t" 648 "preceu.ph.qbr %[p1], %[qload1] \n\t" 649 "preceu.ph.qbl %[p2], %[qload1] \n\t" 650 "preceu.ph.qbr %[p3], %[qload2] \n\t" 651 "preceu.ph.qbl %[p4], %[qload2] \n\t" 652 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ 653 "ulw %[qload3], 9(%[src]) \n\t" 654 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */ 655 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ 656 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 657 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 658 659 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ 660 661 /* odd 2. pixel */ 662 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 663 "mthi $zero, $ac2 \n\t" 664 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ 665 "preceu.ph.qbr %[p1], %[qload3] \n\t" 666 "preceu.ph.qbl %[p5], %[qload3] \n\t" 667 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ 668 "ulw %[qload1], 13(%[src]) \n\t" 669 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */ 670 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ 671 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 672 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 673 674 /* odd 3. pixel */ 675 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 676 "mthi $zero, $ac3 \n\t" 677 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ 678 "preceu.ph.qbr %[p2], %[qload1] \n\t" 679 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */ 680 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ 681 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 682 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 683 684 /* odd 4. pixel */ 685 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 686 "mthi $zero, $ac1 \n\t" 687 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ 688 "preceu.ph.qbl %[p3], %[qload1] \n\t" 689 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ 690 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ 691 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */ 692 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 693 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 694 695 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ 696 697 /* odd 5. pixel */ 698 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 699 "mthi $zero, $ac2 \n\t" 700 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ 701 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ 702 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */ 703 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 704 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 705 706 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ 707 708 /* odd 6. pixel */ 709 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 710 "mthi $zero, $ac3 \n\t" 711 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ 712 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ 713 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */ 714 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 715 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 716 717 /* odd 7. pixel */ 718 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 719 "mthi $zero, $ac1 \n\t" 720 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ 721 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ 722 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ 723 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */ 724 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 725 726 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ 727 728 /* odd 8. pixel */ 729 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */ 730 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 731 732 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ 733 734 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 735 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ 736 737 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 738 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ 739 740 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 741 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ 742 743 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ 744 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ 745 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ 746 747 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), 748 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), 749 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 750 [qload3] "=&r" (qload3), [p5] "=&r" (p5), 751 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) 752 : [filter45] "r" (filter45), [vector_64] "r" (vector_64), 753 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) 754 ); 755 756 src += 16; 757 dst += 16; 758 } 759 760 /* Next row... */ 761 src_ptr += src_stride; 762 dst_ptr += dst_stride; 763 } 764} 765 766void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, 767 uint8_t *dst, ptrdiff_t dst_stride, 768 const int16_t *filter_x, int x_step_q4, 769 const int16_t *filter_y, int y_step_q4, 770 int w, int h) { 771 uint32_t pos = 38; 772 773 assert(x_step_q4 == 16); 774 775 /* bit positon for extract from acc */ 776 __asm__ __volatile__ ( 777 "wrdsp %[pos], 1 \n\t" 778 : 779 : [pos] "r" (pos) 780 ); 781 782 /* prefetch data to cache memory */ 783 prefetch_load(src); 784 prefetch_load(src + 32); 785 prefetch_store(dst); 786 787 switch (w) { 788 case 4: 789 convolve_bi_avg_horiz_4_dspr2(src, src_stride, 790 dst, dst_stride, 791 filter_x, h); 792 break; 793 case 8: 794 convolve_bi_avg_horiz_8_dspr2(src, src_stride, 795 dst, dst_stride, 796 filter_x, h); 797 break; 798 case 16: 799 convolve_bi_avg_horiz_16_dspr2(src, src_stride, 800 dst, dst_stride, 801 filter_x, h, 1); 802 break; 803 case 32: 804 convolve_bi_avg_horiz_16_dspr2(src, src_stride, 805 dst, dst_stride, 806 filter_x, h, 2); 807 break; 808 case 64: 809 prefetch_load(src + 64); 810 prefetch_store(dst + 32); 811 812 convolve_bi_avg_horiz_64_dspr2(src, src_stride, 813 dst, dst_stride, 814 filter_x, h); 815 break; 816 default: 817 vpx_convolve8_avg_horiz_c(src, src_stride, 818 dst, dst_stride, 819 filter_x, x_step_q4, 820 filter_y, y_step_q4, 821 w, h); 822 break; 823 } 824} 825#endif 826