convolve2_avg_horiz_dspr2.c revision 7bc9febe8749e98a3812a0dc4380ceae75c29450
/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
/* 2-tap (bilinear) horizontal convolve-and-average for a 4-pixel-wide block,
 * MIPS DSPr2.  For each row, 4 output pixels are filtered with the two taps
 * at filter_x0[3]/filter_x0[4] and then averaged (rounding) with the bytes
 * already in dst via addqh_r.w.  Results are clamped through the
 * vpx_ff_cropTbl lookup (lbux).  Relies on the caller
 * (vpx_convolve2_avg_horiz_dspr2) having set the DSP control extract
 * position with wrdsp before entry. */
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-[0,255] lookup table */
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64; /* rounding constant seeded into accumulators */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3;
  uint32_t tn1, tn2;
  const int16_t *filter = &filter_x0[3]; /* the two non-zero bilinear taps */
  uint32_t filter45;

  /* Pack taps filter[0]/filter[1] into one 32-bit word for dpa.w.ph.
   * NOTE(review): int16_t data read through an int32_t pointer — relies on
   * the compiler tolerating the aliasing/alignment here; presumably safe on
   * the targeted MIPS toolchains, but confirm if porting. */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"

        /* even 2. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"
        "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        "lbu %[p2], 3(%[dst]) \n\t" /* load odd 2 */

        /* odd 1. pixel */
        "lbux %[tp1], %[Temp1](%[cm]) \n\t" /* even 1 */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "lbu %[Temp1], 1(%[dst]) \n\t" /* load odd 1 */
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p3], %[tp2] \n\t"
        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        "lbu %[tn2], 0(%[dst]) \n\t" /* load even 1 */

        /* odd 2. pixel */
        "lbux %[tp2], %[Temp3](%[cm]) \n\t" /* even 2 */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "lbux %[tn1], %[Temp2](%[cm]) \n\t" /* odd 1 */
        "addqh_r.w %[tn2], %[tn2], %[tp1] \n\t" /* average even 1 */
        "dpa.w.ph $ac2, %[p3], %[filter45] \n\t"
        "extp %[Temp4], $ac2, 31 \n\t"

        "lbu %[tp1], 2(%[dst]) \n\t" /* load even 2 */
        "sb %[tn2], 0(%[dst]) \n\t" /* store even 1 */

        /* clamp */
        "addqh_r.w %[Temp1], %[Temp1], %[tn1] \n\t" /* average odd 1 */
        "lbux %[p3], %[Temp4](%[cm]) \n\t" /* odd 2 */
        "sb %[Temp1], 1(%[dst]) \n\t" /* store odd 1 */

        "addqh_r.w %[tp1], %[tp1], %[tp2] \n\t" /* average even 2 */
        "sb %[tp1], 2(%[dst]) \n\t" /* store even 2 */

        "addqh_r.w %[p2], %[p2], %[p3] \n\t" /* average odd 2 */
        "sb %[p2], 3(%[dst]) \n\t" /* store odd 2 */

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* Same as convolve_bi_avg_horiz_4_dspr2, but for an 8-pixel-wide block:
 * each row filters 8 pixels (split into even/odd lanes), averages each
 * result with the corresponding dst byte, and writes back.
 * Note: n1 is listed as an asm output but is not referenced inside the
 * asm body — presumably a leftover operand; harmless but unused. */
static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
                                          int32_t src_stride, uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-[0,255] lookup table */
  uint32_t vector4a = 64;       /* rounding constant for the accumulators */
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3, tp4;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3]; /* the two non-zero bilinear taps */
  uint32_t filter45;

  /* Pack both taps into one word (see NOTE in the 4-wide variant about the
   * int16_t -> int32_t type-punned load). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__(
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "preceu.ph.qbl %[p4], %[tp2] \n\t"
        "ulw %[tp3], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"
        "lbu %[Temp2], 0(%[dst]) \n\t"
        "lbu %[tp4], 2(%[dst]) \n\t"

        /* even 2. pixel */
        "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* even 3. pixel */
        "lbux %[st0], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "lbux %[st1], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
        "extp %[Temp1], $ac1, 31 \n\t"

        "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"
        "addqh_r.w %[tp4], %[tp4], %[st1] \n\t"
        "sb %[Temp2], 0(%[dst]) \n\t"
        "sb %[tp4], 2(%[dst]) \n\t"

        /* even 4. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"

        "balign %[tp3], %[tp2], 3 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"

        "lbux %[st0], %[Temp1](%[cm]) \n\t"
        "lbu %[Temp2], 4(%[dst]) \n\t"
        "addqh_r.w %[Temp2], %[Temp2], %[st0] \n\t"

        "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* odd 1. pixel */
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "sb %[Temp2], 4(%[dst]) \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p2], %[tp2] \n\t"
        "preceu.ph.qbr %[p3], %[tp3] \n\t"
        "preceu.ph.qbl %[p4], %[tp3] \n\t"
        "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        "lbu %[tp1], 6(%[dst]) \n\t"

        /* odd 2. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "lbux %[st0], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
        "extp %[Temp3], $ac1, 31 \n\t"

        "lbu %[tp2], 1(%[dst]) \n\t"
        "lbu %[tp3], 3(%[dst]) \n\t"
        "addqh_r.w %[tp1], %[tp1], %[st0] \n\t"

        /* odd 3. pixel */
        "lbux %[st1], %[Temp2](%[cm]) \n\t"
        "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
        "addqh_r.w %[tp2], %[tp2], %[st1] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        "lbu %[tp4], 5(%[dst]) \n\t"

        /* odd 4. pixel */
        "sb %[tp2], 1(%[dst]) \n\t"
        "sb %[tp1], 6(%[dst]) \n\t"
        "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
        "extp %[Temp1], $ac2, 31 \n\t"

        "lbu %[tp1], 7(%[dst]) \n\t"

        /* clamp */
        "lbux %[p4], %[Temp3](%[cm]) \n\t"
        "addqh_r.w %[tp3], %[tp3], %[p4] \n\t"

        "lbux %[p2], %[Temp2](%[cm]) \n\t"
        "addqh_r.w %[tp4], %[tp4], %[p2] \n\t"

        "lbux %[p1], %[Temp1](%[cm]) \n\t"
        "addqh_r.w %[tp1], %[tp1], %[p1] \n\t"

        /* store bytes */
        "sb %[tp3], 3(%[dst]) \n\t"
        "sb %[tp4], 5(%[dst]) \n\t"
        "sb %[tp1], 7(%[dst]) \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
          [tp4] "=&r"(tp4), [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1),
          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
          [dst] "r"(dst), [src] "r"(src));

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* 2-tap horizontal convolve-and-average over a row of 16*count pixels.
 * Each inner-loop iteration processes one 16-pixel chunk: the 8 even-index
 * outputs first, then the 8 odd-index outputs (from src re-read at +1),
 * each averaged against the existing dst byte before being stored.
 * count == 1 handles w == 16; count == 2 handles w == 32 (see dispatcher). */
static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride,
                                           uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0, int32_t h,
                                           int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-[0,255] lookup table */
  uint32_t vector_64 = 64;      /* rounding constant for the accumulators */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3]; /* the two non-zero bilinear taps */
  uint32_t filter45;

  /* Pack both taps into one word (see NOTE in the 4-wide variant about the
   * int16_t -> int32_t type-punned load). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__(
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
          "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */

          "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
          "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
          "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
          "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
          "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
          "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
          "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
          "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
          "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
          "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
          "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
          "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
          "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */

          "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
          "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
          "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */

          /* ODD pixels */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"

          "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
          "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */

          "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
          "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
          "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
          "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
          "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */

          "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
          "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
          "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */

          "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
          "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
          "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
          "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
          "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
          "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */

          "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */

          "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */

          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */

          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */

          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */

          "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
          "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
          "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* 64-pixel-wide variant: structurally identical to the 16-wide function with
 * a fixed inner count of 4 (4 x 16 = 64 pixels per row) and wider prefetch
 * coverage for the larger rows. */
static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride,
                                           uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl; /* clamp-to-[0,255] lookup table */
  uint32_t vector_64 = 64;      /* rounding constant for the accumulators */
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3]; /* the two non-zero bilinear taps */
  uint32_t filter45;

  /* Pack both taps into one word (see NOTE in the 4-wide variant about the
   * int16_t -> int32_t type-punned load). */
  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_load(src_ptr + src_stride + 64);
    prefetch_store(dst_ptr + dst_stride);
    prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__(
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
          "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */

          "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */
          "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */
          "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */
          "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */
          "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */
          "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */
          "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */
          "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */
          "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
          "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */
          "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */
          "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
          "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */

          "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */
          "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
          "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */

          /* ODD pixels */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"

          "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
          "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */

          "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
          "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
          "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */
          "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */
          "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */

          "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */
          "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */
          "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */

          "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */
          "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */
          "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */
          "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */
          "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */
          "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */

          "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */

          "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */

          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
          "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */

          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
          "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */

          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
          "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */

          "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */
          "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */
          "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */

          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [st1] "=&r"(st1),
            [st2] "=&r"(st2), [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [p3] "=&r"(p3), [p4] "=&r"(p4), [qload3] "=&r"(qload3),
            [p5] "=&r"(p5), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [Temp3] "=&r"(Temp3)
          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
            [dst] "r"(dst), [src] "r"(src));

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* Public entry point: 2-tap horizontal convolve with dst averaging.
 * Sets the DSP control extract position (wrdsp) required by the extp
 * instructions in the width-specialized helpers, then dispatches on w.
 * Widths other than 4/8/16/32/64 fall back to the C implementation.
 * Only x_step_q4 == 16 (no horizontal scaling) is supported (asserted);
 * filter_y/y_step_q4 are used only by the C fallback path. */
void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  uint32_t pos = 38;

  assert(x_step_q4 == 16);

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_avg_horiz_4_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
      break;
    case 8:
      convolve_bi_avg_horiz_8_dspr2(src, src_stride, dst, dst_stride, filter_x,
                                    h);
      break;
    case 16:
      /* count == 1: one 16-pixel chunk per row */
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h, 1);
      break;
    case 32:
      /* count == 2: two 16-pixel chunks per row */
      convolve_bi_avg_horiz_16_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h, 2);
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_avg_horiz_64_dspr2(src, src_stride, dst, dst_stride,
                                     filter_x, h);
      break;
    default:
      /* Unsupported width: defer to the generic C version. */
      vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
                                x_step_q4, filter_y, y_step_q4, w, h);
      break;
  }
}
#endif