/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
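
/* Vertical 8-tap convolve whose output is averaged into the destination
 * (the "avg" variant).  Four output pixels are produced per inner-loop
 * iteration, one per DSP accumulator ($ac0-$ac3).  The eight 16-bit taps
 * are read as four int32 words, each packing an adjacent tap pair for
 * dpa.w.ph; every accumulator is seeded with 64, the FILTER_BITS rounding
 * constant, and extp (position 38, set by the caller via wrdsp) returns
 * the rounded sum shifted down by FILTER_BITS.  A rough scalar sketch of
 * one output pixel, mirroring vpx_convolve8_avg_vert_c (reference only,
 * not built):
 *
 *   int sum = 0;
 *   for (k = 0; k < 8; ++k)                 // src was rewound 3 rows
 *     sum += src[k * src_stride + x] * filter_y[k];
 *   res = cm[(sum + 64) >> 7];              // round, then clip via cropTbl
 *   dst[x] = (dst[x] + res + 1) >> 1;       // addqh_r.w rounded average
 */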
static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_y, int32_t w,
                                      int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__(
          "ulw              %[load1],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load2],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load3],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load4],     0(%[src_ptr])                 \n\t"

          "mtlo             %[vector4a],  $ac0                          \n\t"
          "mtlo             %[vector4a],  $ac1                          \n\t"
          "mtlo             %[vector4a],  $ac2                          \n\t"
          "mtlo             %[vector4a],  $ac3                          \n\t"
          "mthi             $zero,        $ac0                          \n\t"
          "mthi             $zero,        $ac1                          \n\t"
          "mthi             $zero,        $ac2                          \n\t"
          "mthi             $zero,        $ac3                          \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                      \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                      \n\t"
          "precrq.ph.w      %[n1],        %[p1],         %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],   16             \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                      \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                      \n\t"
          "precrq.ph.w      %[n2],        %[p2],         %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],   16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],         %[vector1b]    \n\t"
          "dpa.w.ph         $ac0,         %[p2],         %[vector2b]    \n\t"
          "dpa.w.ph         $ac1,         %[n1],         %[vector1b]    \n\t"
          "dpa.w.ph         $ac1,         %[n2],         %[vector2b]    \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                      \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                      \n\t"
          "precrq.ph.w      %[n1],        %[p1],         %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],   16             \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                      \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                      \n\t"
          "precrq.ph.w      %[n2],        %[p2],         %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],   16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],         %[vector1b]    \n\t"
          "dpa.w.ph         $ac2,         %[p2],         %[vector2b]    \n\t"
          "dpa.w.ph         $ac3,         %[n1],         %[vector1b]    \n\t"
          "dpa.w.ph         $ac3,         %[n2],         %[vector2b]    \n\t"

          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load1],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load2],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load3],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load4],     0(%[src_ptr])                 \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                      \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                      \n\t"
          "precrq.ph.w      %[n1],        %[p1],         %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],   16             \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                      \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                      \n\t"
          "precrq.ph.w      %[n2],        %[p2],         %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],   16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],         %[vector3b]    \n\t"
          "dpa.w.ph         $ac0,         %[p2],         %[vector4b]    \n\t"
          "extp             %[Temp1],     $ac0,          31             \n\t"
          "dpa.w.ph         $ac1,         %[n1],         %[vector3b]    \n\t"
          "dpa.w.ph         $ac1,         %[n2],         %[vector4b]    \n\t"
          "extp             %[Temp2],     $ac1,          31             \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                      \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                      \n\t"
          "precrq.ph.w      %[n1],        %[p1],         %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],   16             \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                 \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                      \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                      \n\t"
          "precrq.ph.w      %[n2],        %[p2],         %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],   16             \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                 \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])               \n\t"
          "dpa.w.ph         $ac2,         %[p1],         %[vector3b]    \n\t"
          "dpa.w.ph         $ac2,         %[p2],         %[vector4b]    \n\t"
          "addqh_r.w        %[store1],    %[store1],     %[scratch1]    \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,          31             \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])               \n\t"
          "dpa.w.ph         $ac3,         %[n1],         %[vector3b]    \n\t"
          "dpa.w.ph         $ac3,         %[n2],         %[vector4b]    \n\t"
          "addqh_r.w        %[store2],    %[store2],     %[scratch2]    \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,          31             \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                 \n\t"

          "sb               %[store1],    0(%[dst_ptr])                 \n\t"
          "sb               %[store2],    1(%[dst_ptr])                 \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                 \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])               \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])               \n\t"
          "addqh_r.w        %[store1],    %[store1],     %[scratch1]    \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],     %[scratch2]    \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                 \n\t"
          "sb               %[store2],    3(%[dst_ptr])                 \n\t"

          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
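
/* Same inner loop as convolve_avg_vert_4_dspr2, but with the width fixed
 * at 64 so the column-loop bound is a compile-time constant, and with a
 * second prefetch_store covering the right half of each destination row.
 */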
static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_y, int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);
    prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__(
          "ulw              %[load1],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load2],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load3],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load4],     0(%[src_ptr])                 \n\t"

          "mtlo             %[vector4a],  $ac0                          \n\t"
          "mtlo             %[vector4a],  $ac1                          \n\t"
          "mtlo             %[vector4a],  $ac2                          \n\t"
          "mtlo             %[vector4a],  $ac3                          \n\t"
          "mthi             $zero,        $ac0                          \n\t"
          "mthi             $zero,        $ac1                          \n\t"
          "mthi             $zero,        $ac2                          \n\t"
          "mthi             $zero,        $ac3                          \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                      \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                      \n\t"
          "precrq.ph.w      %[n1],        %[p1],         %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],   16             \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                      \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                      \n\t"
          "precrq.ph.w      %[n2],        %[p2],         %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],   16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],         %[vector1b]    \n\t"
          "dpa.w.ph         $ac0,         %[p2],         %[vector2b]    \n\t"
          "dpa.w.ph         $ac1,         %[n1],         %[vector1b]    \n\t"
          "dpa.w.ph         $ac1,         %[n2],         %[vector2b]    \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                      \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                      \n\t"
          "precrq.ph.w      %[n1],        %[p1],         %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],   16             \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                      \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                      \n\t"
          "precrq.ph.w      %[n2],        %[p2],         %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],   16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],         %[vector1b]    \n\t"
          "dpa.w.ph         $ac2,         %[p2],         %[vector2b]    \n\t"
          "dpa.w.ph         $ac3,         %[n1],         %[vector1b]    \n\t"
          "dpa.w.ph         $ac3,         %[n2],         %[vector2b]    \n\t"

          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load1],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load2],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load3],     0(%[src_ptr])                 \n\t"
          "add              %[src_ptr],   %[src_ptr],    %[src_stride]  \n\t"
          "ulw              %[load4],     0(%[src_ptr])                 \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                      \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                      \n\t"
          "precrq.ph.w      %[n1],        %[p1],         %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],   16             \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                      \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                      \n\t"
          "precrq.ph.w      %[n2],        %[p2],         %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],   16             \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],         %[vector3b]    \n\t"
          "dpa.w.ph         $ac0,         %[p2],         %[vector4b]    \n\t"
          "extp             %[Temp1],     $ac0,          31             \n\t"
          "dpa.w.ph         $ac1,         %[n1],         %[vector3b]    \n\t"
          "dpa.w.ph         $ac1,         %[n2],         %[vector4b]    \n\t"
          "extp             %[Temp2],     $ac1,          31             \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                      \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                      \n\t"
          "precrq.ph.w      %[n1],        %[p1],         %[scratch1]    \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],   16             \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                 \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                      \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                      \n\t"
          "precrq.ph.w      %[n2],        %[p2],         %[scratch2]    \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],   16             \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                 \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])               \n\t"
          "dpa.w.ph         $ac2,         %[p1],         %[vector3b]    \n\t"
          "dpa.w.ph         $ac2,         %[p2],         %[vector4b]    \n\t"
          "addqh_r.w        %[store1],    %[store1],     %[scratch1]    \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,          31             \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])               \n\t"
          "dpa.w.ph         $ac3,         %[n1],         %[vector3b]    \n\t"
          "dpa.w.ph         $ac3,         %[n2],         %[vector4b]    \n\t"
          "addqh_r.w        %[store2],    %[store2],     %[scratch2]    \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,          31             \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                 \n\t"

          "sb               %[store1],    0(%[dst_ptr])                 \n\t"
          "sb               %[store2],    1(%[dst_ptr])                 \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                 \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])               \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])               \n\t"
          "addqh_r.w        %[store1],    %[store1],     %[scratch1]    \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],     %[scratch2]    \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                 \n\t"
          "sb               %[store2],    3(%[dst_ptr])                 \n\t"

          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}
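
/* Dispatcher for the vertical avg convolve.  The eight 16-bit taps are
 * inspected through an int32 alias: word 0 is zero exactly when taps 0
 * and 1 are both zero, which among the standard kernels marks the
 * bilinear filters, so those are routed to the 2-tap path.  The assert
 * rejects word 1 == 0x800000 - on little-endian MIPS that is taps
 * {0, 128} in positions 2 and 3, i.e. the trivial copy kernel, which is
 * presumably expected to reach vpx_convolve_avg instead.  The wrdsp below
 * sets the DSPControl extract position to bit 38 so that each extp in the
 * workers returns accumulator bits [38:7], i.e. the tap sum shifted down
 * by FILTER_BITS (7).
 */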
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
  const int16_t *const filter_y = filter[y0_q4];
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (((const int32_t *)filter_y)[0] == 0) {
    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

    /* bit position for extract from acc */
    __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                         :
                         : [pos] "r"(pos));

    prefetch_store(dst);

    switch (w) {
      case 4:
      case 8:
      case 16:
      case 32:
        convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y,
                                  w, h);
        break;
      case 64:
        prefetch_store(dst + 32);
        convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
                                   h);
        break;
      default:
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}
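
/* Two-pass 8-tap convolve with averaging only on the second pass: filter
 * horizontally into a 64-column intermediate buffer, then filter that
 * vertically while averaging into dst.  The horizontal pass starts 3 rows
 * above src and produces h + 7 rows (with y_step_q4 == 16,
 * ((h * y_step_q4) >> 4) + 7 is just h + 7), because the vertical 8-tap
 * filter reads rows y - 3 .. y + 4 around each output row; the vertical
 * pass is handed temp + 64 * 3 so that its own 3-row rewind lands back at
 * the first row of temp.
 */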
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  if (intermediate_height < h) intermediate_height = h;

  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter,
                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,
                      intermediate_height);

  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4,
                         x_step_q4, y0_q4, y_step_q4, w, h);
}
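
/* Unfiltered avg: each destination pixel becomes the rounded average
 * (dst + src + 1) >> 1.  adduh_r.qb performs that computation on four
 * byte lanes at once, which is exactly the per-byte arithmetic of the
 * scalar fallback in the default case below.
 */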
void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  int x, y;
  uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw          %[tp1],  0(%[src])         \n\t"
            "ulw          %[tp2],  0(%[dst])         \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "sw           %[tn1],  0(%[dst])         \n\t" /* store */

            : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw          %[tp1],  0(%[src])         \n\t"
            "ulw          %[tp2],  0(%[dst])         \n\t"
            "ulw          %[tp3],  4(%[src])         \n\t"
            "ulw          %[tp4],  4(%[dst])         \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "sw           %[tn1],  0(%[dst])         \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  4(%[dst])         \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw          %[tp1],  0(%[src])         \n\t"
            "ulw          %[tp2],  0(%[dst])         \n\t"
            "ulw          %[tp3],  4(%[src])         \n\t"
            "ulw          %[tp4],  4(%[dst])         \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  8(%[src])         \n\t"
            "ulw          %[tp2],  8(%[dst])         \n\t"
            "sw           %[tn1],  0(%[dst])         \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  4(%[dst])         \n\t" /* store */
            "ulw          %[tp3],  12(%[src])        \n\t"
            "ulw          %[tp4],  12(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "sw           %[tn1],  8(%[dst])         \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  12(%[dst])        \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw          %[tp1],  0(%[src])         \n\t"
            "ulw          %[tp2],  0(%[dst])         \n\t"
            "ulw          %[tp3],  4(%[src])         \n\t"
            "ulw          %[tp4],  4(%[dst])         \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  8(%[src])         \n\t"
            "ulw          %[tp2],  8(%[dst])         \n\t"
            "sw           %[tn1],  0(%[dst])         \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  4(%[dst])         \n\t" /* store */
            "ulw          %[tp3],  12(%[src])        \n\t"
            "ulw          %[tp4],  12(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  16(%[src])        \n\t"
            "ulw          %[tp2],  16(%[dst])        \n\t"
            "sw           %[tn1],  8(%[dst])         \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  12(%[dst])        \n\t" /* store */
            "ulw          %[tp3],  20(%[src])        \n\t"
            "ulw          %[tp4],  20(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  24(%[src])        \n\t"
            "ulw          %[tp2],  24(%[dst])        \n\t"
            "sw           %[tn1],  16(%[dst])        \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  20(%[dst])        \n\t" /* store */
            "ulw          %[tp3],  28(%[src])        \n\t"
            "ulw          %[tp4],  28(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "sw           %[tn1],  24(%[dst])        \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  28(%[dst])        \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__(
            "ulw          %[tp1],  0(%[src])         \n\t"
            "ulw          %[tp2],  0(%[dst])         \n\t"
            "ulw          %[tp3],  4(%[src])         \n\t"
            "ulw          %[tp4],  4(%[dst])         \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  8(%[src])         \n\t"
            "ulw          %[tp2],  8(%[dst])         \n\t"
            "sw           %[tn1],  0(%[dst])         \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  4(%[dst])         \n\t" /* store */
            "ulw          %[tp3],  12(%[src])        \n\t"
            "ulw          %[tp4],  12(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  16(%[src])        \n\t"
            "ulw          %[tp2],  16(%[dst])        \n\t"
            "sw           %[tn1],  8(%[dst])         \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  12(%[dst])        \n\t" /* store */
            "ulw          %[tp3],  20(%[src])        \n\t"
            "ulw          %[tp4],  20(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  24(%[src])        \n\t"
            "ulw          %[tp2],  24(%[dst])        \n\t"
            "sw           %[tn1],  16(%[dst])        \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  20(%[dst])        \n\t" /* store */
            "ulw          %[tp3],  28(%[src])        \n\t"
            "ulw          %[tp4],  28(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  32(%[src])        \n\t"
            "ulw          %[tp2],  32(%[dst])        \n\t"
            "sw           %[tn1],  24(%[dst])        \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  28(%[dst])        \n\t" /* store */
            "ulw          %[tp3],  36(%[src])        \n\t"
            "ulw          %[tp4],  36(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  40(%[src])        \n\t"
            "ulw          %[tp2],  40(%[dst])        \n\t"
            "sw           %[tn1],  32(%[dst])        \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  36(%[dst])        \n\t" /* store */
            "ulw          %[tp3],  44(%[src])        \n\t"
            "ulw          %[tp4],  44(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  48(%[src])        \n\t"
            "ulw          %[tp2],  48(%[dst])        \n\t"
            "sw           %[tn1],  40(%[dst])        \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  44(%[dst])        \n\t" /* store */
            "ulw          %[tp3],  52(%[src])        \n\t"
            "ulw          %[tp4],  52(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "ulw          %[tp1],  56(%[src])        \n\t"
            "ulw          %[tp2],  56(%[dst])        \n\t"
            "sw           %[tn1],  48(%[dst])        \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  52(%[dst])        \n\t" /* store */
            "ulw          %[tp3],  60(%[src])        \n\t"
            "ulw          %[tp4],  60(%[dst])        \n\t"
            "adduh_r.qb   %[tn1],  %[tp2],  %[tp1]   \n\t" /* average */
            "sw           %[tn1],  56(%[dst])        \n\t" /* store */
            "adduh_r.qb   %[tn2],  %[tp3],  %[tp4]   \n\t" /* average */
            "sw           %[tn2],  60(%[dst])        \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
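    /* Widths other than the handled block sizes: scalar fallback, one byte
     * at a time (same arithmetic as the adduh_r.qb lanes above). */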
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif