1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <assert.h> 12#include <stdio.h> 13 14#include "./vpx_dsp_rtcd.h" 15#include "vpx_dsp/mips/convolve_common_dspr2.h" 16#include "vpx_dsp/vpx_convolve.h" 17#include "vpx_dsp/vpx_dsp_common.h" 18#include "vpx_ports/mem.h" 19 20#if HAVE_DSPR2 21static void convolve_avg_vert_4_dspr2(const uint8_t *src, 22 int32_t src_stride, 23 uint8_t *dst, 24 int32_t dst_stride, 25 const int16_t *filter_y, 26 int32_t w, 27 int32_t h) { 28 int32_t x, y; 29 const uint8_t *src_ptr; 30 uint8_t *dst_ptr; 31 uint8_t *cm = vpx_ff_cropTbl; 32 uint32_t vector4a = 64; 33 uint32_t load1, load2, load3, load4; 34 uint32_t p1, p2; 35 uint32_t n1, n2; 36 uint32_t scratch1, scratch2; 37 uint32_t store1, store2; 38 int32_t vector1b, vector2b, vector3b, vector4b; 39 int32_t Temp1, Temp2; 40 41 vector1b = ((const int32_t *)filter_y)[0]; 42 vector2b = ((const int32_t *)filter_y)[1]; 43 vector3b = ((const int32_t *)filter_y)[2]; 44 vector4b = ((const int32_t *)filter_y)[3]; 45 46 src -= 3 * src_stride; 47 48 for (y = h; y--;) { 49 /* prefetch data to cache memory */ 50 prefetch_store(dst + dst_stride); 51 52 for (x = 0; x < w; x += 4) { 53 src_ptr = src + x; 54 dst_ptr = dst + x; 55 56 __asm__ __volatile__ ( 57 "ulw %[load1], 0(%[src_ptr]) \n\t" 58 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 59 "ulw %[load2], 0(%[src_ptr]) \n\t" 60 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 61 "ulw %[load3], 0(%[src_ptr]) \n\t" 62 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 63 "ulw %[load4], 0(%[src_ptr]) \n\t" 64 65 "mtlo %[vector4a], $ac0 \n\t" 66 "mtlo %[vector4a], $ac1 \n\t" 67 "mtlo %[vector4a], $ac2 \n\t" 68 "mtlo %[vector4a], $ac3 \n\t" 69 "mthi $zero, $ac0 \n\t" 70 "mthi $zero, $ac1 \n\t" 71 "mthi $zero, $ac2 \n\t" 72 "mthi $zero, $ac3 \n\t" 73 74 "preceu.ph.qbr %[scratch1], %[load1] \n\t" 75 "preceu.ph.qbr %[p1], %[load2] \n\t" 76 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 77 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 78 "preceu.ph.qbr %[scratch2], %[load3] \n\t" 79 "preceu.ph.qbr %[p2], %[load4] \n\t" 80 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 81 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 82 83 "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" 84 "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" 85 "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" 86 "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" 87 88 "preceu.ph.qbl %[scratch1], %[load1] \n\t" 89 "preceu.ph.qbl %[p1], %[load2] \n\t" 90 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 91 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 92 "preceu.ph.qbl %[scratch2], %[load3] \n\t" 93 "preceu.ph.qbl %[p2], %[load4] \n\t" 94 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 95 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 96 97 "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" 98 "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" 99 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" 100 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" 101 102 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 103 "ulw %[load1], 0(%[src_ptr]) \n\t" 104 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 105 "ulw %[load2], 0(%[src_ptr]) \n\t" 106 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 107 "ulw %[load3], 0(%[src_ptr]) \n\t" 108 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 109 "ulw %[load4], 0(%[src_ptr]) \n\t" 110 111 "preceu.ph.qbr %[scratch1], %[load1] \n\t" 112 "preceu.ph.qbr %[p1], %[load2] \n\t" 113 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 114 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 115 "preceu.ph.qbr %[scratch2], %[load3] \n\t" 116 "preceu.ph.qbr %[p2], %[load4] \n\t" 117 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 118 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 119 120 "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" 121 "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" 122 "extp %[Temp1], $ac0, 31 \n\t" 123 "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" 124 "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" 125 "extp %[Temp2], $ac1, 31 \n\t" 126 127 "preceu.ph.qbl %[scratch1], %[load1] \n\t" 128 "preceu.ph.qbl %[p1], %[load2] \n\t" 129 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 130 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 131 "lbu %[scratch1], 0(%[dst_ptr]) \n\t" 132 "preceu.ph.qbl %[scratch2], %[load3] \n\t" 133 "preceu.ph.qbl %[p2], %[load4] \n\t" 134 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 135 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 136 "lbu %[scratch2], 1(%[dst_ptr]) \n\t" 137 138 "lbux %[store1], %[Temp1](%[cm]) \n\t" 139 "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" 140 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 141 "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ 142 "extp %[Temp1], $ac2, 31 \n\t" 143 144 "lbux %[store2], %[Temp2](%[cm]) \n\t" 145 "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" 146 "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" 147 "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ 148 "extp %[Temp2], $ac3, 31 \n\t" 149 "lbu %[scratch1], 2(%[dst_ptr]) \n\t" 150 151 "sb %[store1], 0(%[dst_ptr]) \n\t" 152 "sb %[store2], 1(%[dst_ptr]) \n\t" 153 "lbu %[scratch2], 3(%[dst_ptr]) \n\t" 154 155 "lbux %[store1], %[Temp1](%[cm]) \n\t" 156 "lbux %[store2], %[Temp2](%[cm]) \n\t" 157 "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ 158 "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ 159 160 "sb %[store1], 2(%[dst_ptr]) \n\t" 161 "sb %[store2], 3(%[dst_ptr]) \n\t" 162 163 : [load1] "=&r" (load1), [load2] "=&r" (load2), 164 [load3] "=&r" (load3), [load4] "=&r" (load4), 165 [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), 166 [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), 167 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 168 [store1] "=&r" (store1), [store2] "=&r" (store2), 169 [src_ptr] "+r" (src_ptr) 170 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 171 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), 172 [vector4a] "r" (vector4a), 173 [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) 174 ); 175 } 176 177 /* Next row... */ 178 src += src_stride; 179 dst += dst_stride; 180 } 181} 182 183static void convolve_avg_vert_64_dspr2(const uint8_t *src, 184 int32_t src_stride, 185 uint8_t *dst, 186 int32_t dst_stride, 187 const int16_t *filter_y, 188 int32_t h) { 189 int32_t x, y; 190 const uint8_t *src_ptr; 191 uint8_t *dst_ptr; 192 uint8_t *cm = vpx_ff_cropTbl; 193 uint32_t vector4a = 64; 194 uint32_t load1, load2, load3, load4; 195 uint32_t p1, p2; 196 uint32_t n1, n2; 197 uint32_t scratch1, scratch2; 198 uint32_t store1, store2; 199 int32_t vector1b, vector2b, vector3b, vector4b; 200 int32_t Temp1, Temp2; 201 202 vector1b = ((const int32_t *)filter_y)[0]; 203 vector2b = ((const int32_t *)filter_y)[1]; 204 vector3b = ((const int32_t *)filter_y)[2]; 205 vector4b = ((const int32_t *)filter_y)[3]; 206 207 src -= 3 * src_stride; 208 209 for (y = h; y--;) { 210 /* prefetch data to cache memory */ 211 prefetch_store(dst + dst_stride); 212 prefetch_store(dst + dst_stride + 32); 213 214 for (x = 0; x < 64; x += 4) { 215 src_ptr = src + x; 216 dst_ptr = dst + x; 217 218 __asm__ __volatile__ ( 219 "ulw %[load1], 0(%[src_ptr]) \n\t" 220 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 221 "ulw %[load2], 0(%[src_ptr]) \n\t" 222 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 223 "ulw %[load3], 0(%[src_ptr]) \n\t" 224 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 225 "ulw %[load4], 0(%[src_ptr]) \n\t" 226 227 "mtlo %[vector4a], $ac0 \n\t" 228 "mtlo %[vector4a], $ac1 \n\t" 229 "mtlo %[vector4a], $ac2 \n\t" 230 "mtlo %[vector4a], $ac3 \n\t" 231 "mthi $zero, $ac0 \n\t" 232 "mthi $zero, $ac1 \n\t" 233 "mthi $zero, $ac2 \n\t" 234 "mthi $zero, $ac3 \n\t" 235 236 "preceu.ph.qbr %[scratch1], %[load1] \n\t" 237 "preceu.ph.qbr %[p1], %[load2] \n\t" 238 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 239 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 240 "preceu.ph.qbr %[scratch2], %[load3] \n\t" 241 "preceu.ph.qbr %[p2], %[load4] \n\t" 242 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 243 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 244 245 "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t" 246 "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t" 247 "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t" 248 "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t" 249 250 "preceu.ph.qbl %[scratch1], %[load1] \n\t" 251 "preceu.ph.qbl %[p1], %[load2] \n\t" 252 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 253 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 254 "preceu.ph.qbl %[scratch2], %[load3] \n\t" 255 "preceu.ph.qbl %[p2], %[load4] \n\t" 256 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 257 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 258 259 "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t" 260 "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t" 261 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t" 262 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t" 263 264 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 265 "ulw %[load1], 0(%[src_ptr]) \n\t" 266 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 267 "ulw %[load2], 0(%[src_ptr]) \n\t" 268 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 269 "ulw %[load3], 0(%[src_ptr]) \n\t" 270 "add %[src_ptr], %[src_ptr], %[src_stride] \n\t" 271 "ulw %[load4], 0(%[src_ptr]) \n\t" 272 273 "preceu.ph.qbr %[scratch1], %[load1] \n\t" 274 "preceu.ph.qbr %[p1], %[load2] \n\t" 275 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 276 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 277 "preceu.ph.qbr %[scratch2], %[load3] \n\t" 278 "preceu.ph.qbr %[p2], %[load4] \n\t" 279 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 280 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 281 282 "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t" 283 "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t" 284 "extp %[Temp1], $ac0, 31 \n\t" 285 "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t" 286 "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t" 287 "extp %[Temp2], $ac1, 31 \n\t" 288 289 "preceu.ph.qbl %[scratch1], %[load1] \n\t" 290 "preceu.ph.qbl %[p1], %[load2] \n\t" 291 "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */ 292 "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */ 293 "lbu %[scratch1], 0(%[dst_ptr]) \n\t" 294 "preceu.ph.qbl %[scratch2], %[load3] \n\t" 295 "preceu.ph.qbl %[p2], %[load4] \n\t" 296 "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */ 297 "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */ 298 "lbu %[scratch2], 1(%[dst_ptr]) \n\t" 299 300 "lbux %[store1], %[Temp1](%[cm]) \n\t" 301 "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t" 302 "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t" 303 "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */ 304 "extp %[Temp1], $ac2, 31 \n\t" 305 306 "lbux %[store2], %[Temp2](%[cm]) \n\t" 307 "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t" 308 "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t" 309 "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */ 310 "extp %[Temp2], $ac3, 31 \n\t" 311 "lbu %[scratch1], 2(%[dst_ptr]) \n\t" 312 313 "sb %[store1], 0(%[dst_ptr]) \n\t" 314 "sb %[store2], 1(%[dst_ptr]) \n\t" 315 "lbu %[scratch2], 3(%[dst_ptr]) \n\t" 316 317 "lbux %[store1], %[Temp1](%[cm]) \n\t" 318 "lbux %[store2], %[Temp2](%[cm]) \n\t" 319 "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */ 320 "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */ 321 322 "sb %[store1], 2(%[dst_ptr]) \n\t" 323 "sb %[store2], 3(%[dst_ptr]) \n\t" 324 325 : [load1] "=&r" (load1), [load2] "=&r" (load2), 326 [load3] "=&r" (load3), [load4] "=&r" (load4), 327 [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2), 328 [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2), 329 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), 330 [store1] "=&r" (store1), [store2] "=&r" (store2), 331 [src_ptr] "+r" (src_ptr) 332 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b), 333 [vector3b] "r" (vector3b), [vector4b] "r" (vector4b), 334 [vector4a] "r" (vector4a), 335 [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr) 336 ); 337 } 338 339 /* Next row... */ 340 src += src_stride; 341 dst += dst_stride; 342 } 343} 344 345void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, 346 uint8_t *dst, ptrdiff_t dst_stride, 347 const int16_t *filter_x, int x_step_q4, 348 const int16_t *filter_y, int y_step_q4, 349 int w, int h) { 350 assert(y_step_q4 == 16); 351 assert(((const int32_t *)filter_y)[1] != 0x800000); 352 353 if (((const int32_t *)filter_y)[0] == 0) { 354 vpx_convolve2_avg_vert_dspr2(src, src_stride, 355 dst, dst_stride, 356 filter_x, x_step_q4, 357 filter_y, y_step_q4, 358 w, h); 359 } else { 360 uint32_t pos = 38; 361 362 /* bit positon for extract from acc */ 363 __asm__ __volatile__ ( 364 "wrdsp %[pos], 1 \n\t" 365 : 366 : [pos] "r" (pos) 367 ); 368 369 prefetch_store(dst); 370 371 switch (w) { 372 case 4: 373 case 8: 374 case 16: 375 case 32: 376 convolve_avg_vert_4_dspr2(src, src_stride, 377 dst, dst_stride, 378 filter_y, w, h); 379 break; 380 case 64: 381 prefetch_store(dst + 32); 382 convolve_avg_vert_64_dspr2(src, src_stride, 383 dst, dst_stride, 384 filter_y, h); 385 break; 386 default: 387 vpx_convolve8_avg_vert_c(src, src_stride, 388 dst, dst_stride, 389 filter_x, x_step_q4, 390 filter_y, y_step_q4, 391 w, h); 392 break; 393 } 394 } 395} 396 397void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, 398 uint8_t *dst, ptrdiff_t dst_stride, 399 const int16_t *filter_x, int x_step_q4, 400 const int16_t *filter_y, int y_step_q4, 401 int w, int h) { 402 /* Fixed size intermediate buffer places limits on parameters. */ 403 DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); 404 int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; 405 406 assert(w <= 64); 407 assert(h <= 64); 408 assert(x_step_q4 == 16); 409 assert(y_step_q4 == 16); 410 411 if (intermediate_height < h) 412 intermediate_height = h; 413 414 vpx_convolve8_horiz(src - (src_stride * 3), src_stride, 415 temp, 64, 416 filter_x, x_step_q4, 417 filter_y, y_step_q4, 418 w, intermediate_height); 419 420 vpx_convolve8_avg_vert(temp + 64 * 3, 64, 421 dst, dst_stride, 422 filter_x, x_step_q4, 423 filter_y, y_step_q4, 424 w, h); 425} 426 427void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, 428 uint8_t *dst, ptrdiff_t dst_stride, 429 const int16_t *filter_x, int filter_x_stride, 430 const int16_t *filter_y, int filter_y_stride, 431 int w, int h) { 432 int x, y; 433 uint32_t tp1, tp2, tn1; 434 uint32_t tp3, tp4, tn2; 435 436 /* prefetch data to cache memory */ 437 prefetch_load(src); 438 prefetch_load(src + 32); 439 prefetch_store(dst); 440 441 switch (w) { 442 case 4: 443 /* 1 word storage */ 444 for (y = h; y--; ) { 445 prefetch_load(src + src_stride); 446 prefetch_load(src + src_stride + 32); 447 prefetch_store(dst + dst_stride); 448 449 __asm__ __volatile__ ( 450 "ulw %[tp1], 0(%[src]) \n\t" 451 "ulw %[tp2], 0(%[dst]) \n\t" 452 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 453 "sw %[tn1], 0(%[dst]) \n\t" /* store */ 454 455 : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1), 456 [tp2] "=&r" (tp2) 457 : [src] "r" (src), [dst] "r" (dst) 458 ); 459 460 src += src_stride; 461 dst += dst_stride; 462 } 463 break; 464 case 8: 465 /* 2 word storage */ 466 for (y = h; y--; ) { 467 prefetch_load(src + src_stride); 468 prefetch_load(src + src_stride + 32); 469 prefetch_store(dst + dst_stride); 470 471 __asm__ __volatile__ ( 472 "ulw %[tp1], 0(%[src]) \n\t" 473 "ulw %[tp2], 0(%[dst]) \n\t" 474 "ulw %[tp3], 4(%[src]) \n\t" 475 "ulw %[tp4], 4(%[dst]) \n\t" 476 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 477 "sw %[tn1], 0(%[dst]) \n\t" /* store */ 478 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 479 "sw %[tn2], 4(%[dst]) \n\t" /* store */ 480 481 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 482 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), 483 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) 484 : [src] "r" (src), [dst] "r" (dst) 485 ); 486 487 src += src_stride; 488 dst += dst_stride; 489 } 490 break; 491 case 16: 492 /* 4 word storage */ 493 for (y = h; y--; ) { 494 prefetch_load(src + src_stride); 495 prefetch_load(src + src_stride + 32); 496 prefetch_store(dst + dst_stride); 497 498 __asm__ __volatile__ ( 499 "ulw %[tp1], 0(%[src]) \n\t" 500 "ulw %[tp2], 0(%[dst]) \n\t" 501 "ulw %[tp3], 4(%[src]) \n\t" 502 "ulw %[tp4], 4(%[dst]) \n\t" 503 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 504 "ulw %[tp1], 8(%[src]) \n\t" 505 "ulw %[tp2], 8(%[dst]) \n\t" 506 "sw %[tn1], 0(%[dst]) \n\t" /* store */ 507 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 508 "sw %[tn2], 4(%[dst]) \n\t" /* store */ 509 "ulw %[tp3], 12(%[src]) \n\t" 510 "ulw %[tp4], 12(%[dst]) \n\t" 511 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 512 "sw %[tn1], 8(%[dst]) \n\t" /* store */ 513 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 514 "sw %[tn2], 12(%[dst]) \n\t" /* store */ 515 516 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 517 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), 518 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) 519 : [src] "r" (src), [dst] "r" (dst) 520 ); 521 522 src += src_stride; 523 dst += dst_stride; 524 } 525 break; 526 case 32: 527 /* 8 word storage */ 528 for (y = h; y--; ) { 529 prefetch_load(src + src_stride); 530 prefetch_load(src + src_stride + 32); 531 prefetch_store(dst + dst_stride); 532 533 __asm__ __volatile__ ( 534 "ulw %[tp1], 0(%[src]) \n\t" 535 "ulw %[tp2], 0(%[dst]) \n\t" 536 "ulw %[tp3], 4(%[src]) \n\t" 537 "ulw %[tp4], 4(%[dst]) \n\t" 538 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 539 "ulw %[tp1], 8(%[src]) \n\t" 540 "ulw %[tp2], 8(%[dst]) \n\t" 541 "sw %[tn1], 0(%[dst]) \n\t" /* store */ 542 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 543 "sw %[tn2], 4(%[dst]) \n\t" /* store */ 544 "ulw %[tp3], 12(%[src]) \n\t" 545 "ulw %[tp4], 12(%[dst]) \n\t" 546 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 547 "ulw %[tp1], 16(%[src]) \n\t" 548 "ulw %[tp2], 16(%[dst]) \n\t" 549 "sw %[tn1], 8(%[dst]) \n\t" /* store */ 550 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 551 "sw %[tn2], 12(%[dst]) \n\t" /* store */ 552 "ulw %[tp3], 20(%[src]) \n\t" 553 "ulw %[tp4], 20(%[dst]) \n\t" 554 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 555 "ulw %[tp1], 24(%[src]) \n\t" 556 "ulw %[tp2], 24(%[dst]) \n\t" 557 "sw %[tn1], 16(%[dst]) \n\t" /* store */ 558 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 559 "sw %[tn2], 20(%[dst]) \n\t" /* store */ 560 "ulw %[tp3], 28(%[src]) \n\t" 561 "ulw %[tp4], 28(%[dst]) \n\t" 562 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 563 "sw %[tn1], 24(%[dst]) \n\t" /* store */ 564 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 565 "sw %[tn2], 28(%[dst]) \n\t" /* store */ 566 567 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 568 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), 569 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) 570 : [src] "r" (src), [dst] "r" (dst) 571 ); 572 573 src += src_stride; 574 dst += dst_stride; 575 } 576 break; 577 case 64: 578 prefetch_load(src + 64); 579 prefetch_store(dst + 32); 580 581 /* 16 word storage */ 582 for (y = h; y--; ) { 583 prefetch_load(src + src_stride); 584 prefetch_load(src + src_stride + 32); 585 prefetch_load(src + src_stride + 64); 586 prefetch_store(dst + dst_stride); 587 prefetch_store(dst + dst_stride + 32); 588 589 __asm__ __volatile__ ( 590 "ulw %[tp1], 0(%[src]) \n\t" 591 "ulw %[tp2], 0(%[dst]) \n\t" 592 "ulw %[tp3], 4(%[src]) \n\t" 593 "ulw %[tp4], 4(%[dst]) \n\t" 594 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 595 "ulw %[tp1], 8(%[src]) \n\t" 596 "ulw %[tp2], 8(%[dst]) \n\t" 597 "sw %[tn1], 0(%[dst]) \n\t" /* store */ 598 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 599 "sw %[tn2], 4(%[dst]) \n\t" /* store */ 600 "ulw %[tp3], 12(%[src]) \n\t" 601 "ulw %[tp4], 12(%[dst]) \n\t" 602 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 603 "ulw %[tp1], 16(%[src]) \n\t" 604 "ulw %[tp2], 16(%[dst]) \n\t" 605 "sw %[tn1], 8(%[dst]) \n\t" /* store */ 606 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 607 "sw %[tn2], 12(%[dst]) \n\t" /* store */ 608 "ulw %[tp3], 20(%[src]) \n\t" 609 "ulw %[tp4], 20(%[dst]) \n\t" 610 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 611 "ulw %[tp1], 24(%[src]) \n\t" 612 "ulw %[tp2], 24(%[dst]) \n\t" 613 "sw %[tn1], 16(%[dst]) \n\t" /* store */ 614 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 615 "sw %[tn2], 20(%[dst]) \n\t" /* store */ 616 "ulw %[tp3], 28(%[src]) \n\t" 617 "ulw %[tp4], 28(%[dst]) \n\t" 618 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 619 "ulw %[tp1], 32(%[src]) \n\t" 620 "ulw %[tp2], 32(%[dst]) \n\t" 621 "sw %[tn1], 24(%[dst]) \n\t" /* store */ 622 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 623 "sw %[tn2], 28(%[dst]) \n\t" /* store */ 624 "ulw %[tp3], 36(%[src]) \n\t" 625 "ulw %[tp4], 36(%[dst]) \n\t" 626 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 627 "ulw %[tp1], 40(%[src]) \n\t" 628 "ulw %[tp2], 40(%[dst]) \n\t" 629 "sw %[tn1], 32(%[dst]) \n\t" /* store */ 630 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 631 "sw %[tn2], 36(%[dst]) \n\t" /* store */ 632 "ulw %[tp3], 44(%[src]) \n\t" 633 "ulw %[tp4], 44(%[dst]) \n\t" 634 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 635 "ulw %[tp1], 48(%[src]) \n\t" 636 "ulw %[tp2], 48(%[dst]) \n\t" 637 "sw %[tn1], 40(%[dst]) \n\t" /* store */ 638 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 639 "sw %[tn2], 44(%[dst]) \n\t" /* store */ 640 "ulw %[tp3], 52(%[src]) \n\t" 641 "ulw %[tp4], 52(%[dst]) \n\t" 642 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 643 "ulw %[tp1], 56(%[src]) \n\t" 644 "ulw %[tp2], 56(%[dst]) \n\t" 645 "sw %[tn1], 48(%[dst]) \n\t" /* store */ 646 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 647 "sw %[tn2], 52(%[dst]) \n\t" /* store */ 648 "ulw %[tp3], 60(%[src]) \n\t" 649 "ulw %[tp4], 60(%[dst]) \n\t" 650 "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */ 651 "sw %[tn1], 56(%[dst]) \n\t" /* store */ 652 "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */ 653 "sw %[tn2], 60(%[dst]) \n\t" /* store */ 654 655 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), 656 [tp3] "=&r" (tp3), [tp4] "=&r" (tp4), 657 [tn1] "=&r" (tn1), [tn2] "=&r" (tn2) 658 : [src] "r" (src), [dst] "r" (dst) 659 ); 660 661 src += src_stride; 662 dst += dst_stride; 663 } 664 break; 665 default: 666 for (y = h; y > 0; --y) { 667 for (x = 0; x < w; ++x) { 668 dst[x] = (dst[x] + src[x] + 1) >> 1; 669 } 670 671 src += src_stride; 672 dst += dst_stride; 673 } 674 break; 675 } 676} 677#endif 678