convolve8_avg_horiz_dspr2.c revision 7ce0a1d1337c01056ba24006efab21f00e179e04
/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
/* Horizontal 8-tap convolution, averaged into dst, for 4-pixel-wide rows
 * (MIPS DSPr2 inline assembly).
 *
 * For each of the h rows, four output pixels are produced.  Each pixel is
 * the 8-tap filter sum of src neighbors (taps applied as four packed int16
 * pairs via dpa.w.ph), rounded with bias 64 (seeded into the accumulator
 * LO register and extracted with "extp ..., 31"), clamped to [0, 255]
 * through the vpx_ff_cropTbl lookup (lbux), then averaged with the byte
 * already present in dst using addqh_r.w (halving add with rounding) and
 * stored back in place.
 *
 * NOTE(review): filter_x0 is read through an int32_t pointer, so the tap
 * array is assumed to be 4-byte aligned -- confirm at call sites.
 */
static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;   /* clamp-to-[0,255] lookup table */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;         /* rounding bias, pre-loaded into LO */
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  /* The 8 filter taps, read as four packed int16 pairs for dpa.w.ph. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],      0(%[src])                  \n\t"
        "ulw              %[tp2],      4(%[src])                  \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[p2],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[vector3b]    \n\t"
        "ulw              %[tn2],      8(%[src])                  \n\t"
        "dpa.w.ph         $ac3,        %[p4],      %[vector4b]    \n\t"
        "extp             %[Temp1],    $ac3,       31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tn2]                     \n\t"
        /* balign shifts the window one byte for the odd-phase pixels */
        "balign           %[tn1],      %[tn2],     3              \n\t"
        "balign           %[tn2],      %[tp2],     3              \n\t"
        "balign           %[tp2],      %[tp1],     3              \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[p3],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[p4],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[p1],      %[vector4b]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        "lbu              %[p2],       3(%[dst])                  \n\t"  /* load odd 2 */

        /* odd 1. pixel */
        "lbux             %[tp1],      %[Temp1](%[cm])            \n\t"  /* even 1 */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "lbu              %[Temp1],    1(%[dst])                  \n\t"  /* load odd 1 */
        "preceu.ph.qbr    %[n1],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[n2],       %[tp2]                     \n\t"
        "preceu.ph.qbr    %[n3],       %[tn2]                     \n\t"
        "preceu.ph.qbl    %[n4],       %[tn2]                     \n\t"
        "dpa.w.ph         $ac3,        %[n1],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[n2],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[n3],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,        %[n4],      %[vector4b]    \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tn2],      0(%[dst])                  \n\t"  /* load even 1 */

        /* odd 2. pixel */
        "lbux             %[tp2],      %[Temp3](%[cm])            \n\t"  /* even 2 */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "preceu.ph.qbr    %[n1],       %[tn1]                     \n\t"
        "lbux             %[tn1],      %[Temp2](%[cm])            \n\t"  /* odd 1 */
        "addqh_r.w        %[tn2],      %[tn2],     %[tp1]         \n\t"  /* average even 1 */
        "dpa.w.ph         $ac2,        %[n2],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[n3],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[n4],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[n1],      %[vector4b]    \n\t"
        "extp             %[Temp4],    $ac2,       31             \n\t"

        "lbu              %[tp1],      2(%[dst])                  \n\t"  /* load even 2 */
        "sb               %[tn2],      0(%[dst])                  \n\t"  /* store even 1 */

        /* clamp */
        "addqh_r.w        %[Temp1],    %[Temp1],   %[tn1]         \n\t"  /* average odd 1 */
        "lbux             %[n2],       %[Temp4](%[cm])            \n\t"  /* odd 2 */
        "sb               %[Temp1],    1(%[dst])                  \n\t"  /* store odd 1 */

        "addqh_r.w        %[tp1],      %[tp1],     %[tp2]         \n\t"  /* average even 2 */
        "sb               %[tp1],      2(%[dst])                  \n\t"  /* store even 2 */

        "addqh_r.w        %[p2],       %[p2],      %[n2]          \n\t"  /* average odd 2 */
        "sb               %[p2],       3(%[dst])                  \n\t"  /* store odd 2 */

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row...
*/
    src += src_stride;
    dst += dst_stride;
  }
}

/* Horizontal 8-tap convolution, averaged into dst, for 8-pixel-wide rows
 * (MIPS DSPr2 inline assembly).
 *
 * Same per-pixel recipe as the 4-wide kernel: 8-tap filter via dpa.w.ph,
 * round (bias 64, extract with "extp ..., 31"), clamp through the
 * vpx_ff_cropTbl lookup, then average with the existing dst byte using
 * addqh_r.w before storing.  Even and odd output pixels are computed in
 * two interleaved passes (the odd phase uses the source window shifted
 * one byte via balign), with loads/stores scheduled between multiplies
 * to hide latency.
 *
 * NOTE(review): filter_x0 is read through an int32_t pointer, so the tap
 * array is assumed to be 4-byte aligned -- confirm at call sites.
 */
static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
  int32_t y;
  uint8_t *cm = vpx_ff_cropTbl;   /* clamp-to-[0,255] lookup table */
  uint32_t vector4a = 64;         /* rounding bias, pre-loaded into LO */
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  /* The 8 filter taps, read as four packed int16 pairs for dpa.w.ph. */
  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_load(src + src_stride);
    prefetch_load(src + src_stride + 32);
    prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],      0(%[src])                  \n\t"
        "ulw              %[tp2],      4(%[src])                  \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                     \n\t"
        "ulw              %[tn2],      8(%[src])                  \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[p2],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,        %[p4],      %[vector4b]    \n\t"
        "extp             %[Temp1],    $ac3,       31             \n\t"
        "lbu              %[Temp2],    0(%[dst])                  \n\t"
        "lbu              %[tn3],      2(%[dst])                  \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr    %[p1],       %[tn2]                     \n\t"
        "preceu.ph.qbl    %[n1],       %[tn2]                     \n\t"
        "ulw              %[tn1],      12(%[src])                 \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[p3],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[p4],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[p1],      %[vector4b]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],      %[Temp1](%[cm])            \n\t"
        "mtlo             %[vector4a], $ac1                       \n\t"
        "mthi             $zero,       $ac1                       \n\t"
        "preceu.ph.qbr    %[p2],       %[tn1]                     \n\t"
        "lbux             %[st1],      %[Temp3](%[cm])            \n\t"
        "dpa.w.ph         $ac1,        %[p3],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac1,        %[p4],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac1,        %[p1],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac1,        %[n1],      %[vector4b]    \n\t"
        "extp             %[Temp1],    $ac1,       31             \n\t"

        "addqh_r.w        %[Temp2],    %[Temp2],   %[st0]         \n\t"
        "addqh_r.w        %[tn3],      %[tn3],     %[st1]         \n\t"
        "sb               %[Temp2],    0(%[dst])                  \n\t"
        "sb               %[tn3],      2(%[dst])                  \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"

        /* shift source window one byte for the odd-phase pixels */
        "balign           %[tn3],      %[tn1],     3              \n\t"
        "balign           %[tn1],      %[tn2],     3              \n\t"
        "balign           %[tn2],      %[tp2],     3              \n\t"
        "balign           %[tp2],      %[tp1],     3              \n\t"

        "lbux             %[st0],      %[Temp1](%[cm])            \n\t"
        "lbu              %[Temp2],    4(%[dst])                  \n\t"
        "addqh_r.w        %[Temp2],    %[Temp2],   %[st0]         \n\t"

        "dpa.w.ph         $ac2,        %[p4],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[p1],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[n1],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[vector4b]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac1                       \n\t"
        "mthi             $zero,       $ac1                       \n\t"
        "sb               %[Temp2],    4(%[dst])                  \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tn2]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tn2]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[p2],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,        %[p4],      %[vector4b]    \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tp1],      6(%[dst])                  \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tn1]                     \n\t"
        "preceu.ph.qbl    %[n1],       %[tn1]                     \n\t"
        "lbux             %[st0],      %[Temp3](%[cm])            \n\t"
        "dpa.w.ph         $ac1,        %[p2],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac1,        %[p3],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac1,        %[p4],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac1,        %[p1],      %[vector4b]    \n\t"
        "extp             %[Temp3],    $ac1,       31             \n\t"

        "lbu              %[tp2],      1(%[dst])                  \n\t"
        "lbu              %[tn2],      3(%[dst])                  \n\t"
        "addqh_r.w        %[tp1],      %[tp1],     %[st0]         \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],      %[Temp2](%[cm])            \n\t"
        "preceu.ph.qbr    %[p2],       %[tn3]                     \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac3,        %[p4],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac3,        %[n1],      %[vector4b]    \n\t"
        "addqh_r.w        %[tp2],      %[tp2],     %[st1]         \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tn3],      5(%[dst])                  \n\t"

        /* odd 4. pixel */
        "sb               %[tp2],      1(%[dst])                  \n\t"
        "sb               %[tp1],      6(%[dst])                  \n\t"
        "dpa.w.ph         $ac2,        %[p4],      %[vector1b]    \n\t"
        "dpa.w.ph         $ac2,        %[p1],      %[vector2b]    \n\t"
        "dpa.w.ph         $ac2,        %[n1],      %[vector3b]    \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[vector4b]    \n\t"
        "extp             %[Temp1],    $ac2,       31             \n\t"

        "lbu              %[tn1],      7(%[dst])                  \n\t"

        /* clamp */
        "lbux             %[p4],       %[Temp3](%[cm])            \n\t"
        "addqh_r.w        %[tn2],      %[tn2],     %[p4]          \n\t"

        "lbux             %[p2],       %[Temp2](%[cm])            \n\t"
        "addqh_r.w        %[tn3],      %[tn3],     %[p2]          \n\t"

        "lbux             %[n1],       %[Temp1](%[cm])            \n\t"
        "addqh_r.w        %[tn1],      %[tn1],     %[n1]          \n\t"

        /* store bytes */
        "sb               %[tn2],      3(%[dst])                  \n\t"
        "sb               %[tn3],      5(%[dst])                  \n\t"
        "sb               %[tn1],      7(%[dst])                  \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2]
"=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

/* Horizontal 8-tap convolution, averaged into dst, for rows processed in
 * 16-pixel chunks (MIPS DSPr2 inline assembly).  Each row handles
 * 16 * count output pixels.
 *
 * Per chunk, the eight even pixels (dst offsets 0..14) are computed first
 * from the word-aligned source window, then the eight odd pixels (offsets
 * 1..15) from the window re-read at a one-byte offset (ulw 1/5/9/...).
 * Each pixel: 8-tap filter via dpa.w.ph, round (bias 64, "extp ..., 31"),
 * clamp through the vpx_ff_cropTbl lookup, average with the existing dst
 * byte via addqh_r.w, store.  The three DSP accumulators ($ac1..$ac3) are
 * rotated so the next pixel's multiplies overlap the current pixel's
 * clamp/average/store.
 *
 * NOTE(review): filter_x0 is read through an int32_t pointer, so the tap
 * array is assumed to be 4-byte aligned -- confirm at call sites.
 */
static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0,
                                        int32_t h,
                                        int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vpx_ff_cropTbl;   /* clamp-to-[0,255] lookup table */
  uint32_t vector_64 = 64;        /* rounding bias, pre-loaded into LO */
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  /* The 8 filter taps, read as four packed int16 pairs for dpa.w.ph. */
  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    prefetch_load(src_ptr + src_stride);
    prefetch_load(src_ptr + src_stride + 32);
    prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                  \n\t"
          "ulw              %[qload2],    4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 1 */
          "mthi             $zero,        $ac1                       \n\t"
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 2 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "ulw              %[qload3],    8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,         %[p1],      %[filter12]    \n\t"  /* even 1 */
          "dpa.w.ph         $ac1,         %[p2],      %[filter34]    \n\t"  /* even 1 */
          "dpa.w.ph         $ac1,         %[p3],      %[filter56]    \n\t"  /* even 1 */
          "dpa.w.ph         $ac1,         %[p4],      %[filter78]    \n\t"  /* even 1 */
          "extp             %[Temp1],     $ac1,       31             \n\t"  /* even 1 */
          "lbu              %[st2],       0(%[dst])                  \n\t"  /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* even 3 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "ulw              %[qload1],    12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,         %[p2],      %[filter12]    \n\t"  /* even 2 */
          "dpa.w.ph         $ac2,         %[p3],      %[filter34]    \n\t"  /* even 2 */
          "dpa.w.ph         $ac2,         %[p4],      %[filter56]    \n\t"  /* even 2 */
          "dpa.w.ph         $ac2,         %[p1],      %[filter78]    \n\t"  /* even 2 */
          "extp             %[Temp2],     $ac2,       31             \n\t"  /* even 2 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 1 */

          "lbu              %[qload3],    2(%[dst])                  \n\t"  /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 4 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[st2],       %[st2],     %[st1]         \n\t"  /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st2],       0(%[dst])                  \n\t"  /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],      %[filter12]    \n\t"  /* even 3 */
          "dpa.w.ph         $ac3,         %[p4],      %[filter34]    \n\t"  /* even 3 */
          "dpa.w.ph         $ac3,         %[p1],      %[filter56]    \n\t"  /* even 3 */
          "dpa.w.ph         $ac3,         %[p5],      %[filter78]    \n\t"  /* even 3 */
          "extp             %[Temp3],     $ac3,       31             \n\t"  /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 2 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 5 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st2]         \n\t"  /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    2(%[dst])                  \n\t"  /* store even 2 to dst */
          "ulw              %[qload2],    16(%[src])                 \n\t"
          "lbu              %[qload3],    4(%[dst])                  \n\t"  /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])                  \n\t"  /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],      %[filter12]    \n\t"  /* even 4 */
          "dpa.w.ph         $ac1,         %[p1],      %[filter34]    \n\t"  /* even 4 */
          "dpa.w.ph         $ac1,         %[p5],      %[filter56]    \n\t"  /* even 4 */
          "dpa.w.ph         $ac1,         %[p2],      %[filter78]    \n\t"  /* even 4 */
          "extp             %[Temp1],     $ac1,       31             \n\t"  /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* even 6 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t"  /* average even 3 */
          "preceu.ph.qbr    %[p4],        %[qload2]                  \n\t"
          "sb               %[qload3],    4(%[dst])                  \n\t"  /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],      %[filter12]    \n\t"  /* even 5 */
          "dpa.w.ph         $ac2,         %[p5],      %[filter34]    \n\t"  /* even 5 */
          "dpa.w.ph         $ac2,         %[p2],      %[filter56]    \n\t"  /* even 5 */
          "dpa.w.ph         $ac2,         %[p3],      %[filter78]    \n\t"  /* even 5 */
          "extp             %[Temp2],     $ac2,       31             \n\t"  /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* even 7 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t"  /* average even 4 */
          "preceu.ph.qbl    %[p1],        %[qload2]                  \n\t"
          "sb               %[qload1],    6(%[dst])                  \n\t"  /* store even 4 to dst */
          "ulw              %[qload3],    20(%[src])                 \n\t"
          "dpa.w.ph         $ac3,         %[p5],      %[filter12]    \n\t"  /* even 6 */
          "dpa.w.ph         $ac3,         %[p2],      %[filter34]    \n\t"  /* even 6 */
          "dpa.w.ph         $ac3,         %[p3],      %[filter56]    \n\t"  /* even 6 */
          "dpa.w.ph         $ac3,         %[p4],      %[filter78]    \n\t"  /* even 6 */
          "lbu              %[qload2],    8(%[dst])                  \n\t"  /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,       31             \n\t"  /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* even 8 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t"  /* average even 5 */
          "preceu.ph.qbr    %[p5],        %[qload3]                  \n\t"
          "sb               %[qload2],    8(%[dst])                  \n\t"  /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],      %[filter12]    \n\t"  /* even 7 */
          "dpa.w.ph         $ac1,         %[p3],      %[filter34]    \n\t"  /* even 7 */
          "dpa.w.ph         $ac1,         %[p4],      %[filter56]    \n\t"  /* even 7 */
          "dpa.w.ph         $ac1,         %[p1],      %[filter78]    \n\t"  /* even 7 */
          "lbu              %[qload3],    10(%[dst])                 \n\t"  /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,       31             \n\t"  /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* even 6 */

          "lbu              %[st2],       12(%[dst])                 \n\t"  /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 1 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t"  /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],      %[filter12]    \n\t"  /* even 8 */
          "dpa.w.ph         $ac2,         %[p4],      %[filter34]    \n\t"  /* even 8 */
          "sb               %[qload3],    10(%[dst])                 \n\t"  /* store even 6 to dst */
          "dpa.w.ph         $ac2,         %[p1],      %[filter56]    \n\t"  /* even 8 */
          "dpa.w.ph         $ac2,         %[p5],      %[filter78]    \n\t"  /* even 8 */
          "extp             %[Temp2],     $ac2,       31             \n\t"  /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* even 7 */

          /* ODD pixels: re-read the window one byte ahead */
          "ulw              %[qload1],    1(%[src])                  \n\t"
          "ulw              %[qload2],    5(%[src])                  \n\t"

          "addqh_r.w        %[st2],       %[st2],     %[st1]         \n\t"  /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 2 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "sb               %[st2],       12(%[dst])                 \n\t"  /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p1],      %[filter12]    \n\t"  /* odd 1 */
          "dpa.w.ph         $ac3,         %[p2],      %[filter34]    \n\t"  /* odd 1 */
          "lbu              %[qload2],    14(%[dst])                 \n\t"  /* load even 8 from dst */
          "dpa.w.ph         $ac3,         %[p3],      %[filter56]    \n\t"  /* odd 1 */
          "dpa.w.ph         $ac3,         %[p4],      %[filter78]    \n\t"  /* odd 1 */
          "extp             %[Temp3],     $ac3,       31             \n\t"  /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* even 8 */

          "lbu              %[st1],       1(%[dst])                  \n\t"  /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* odd 3 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t"  /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "sb               %[qload2],    14(%[dst])                 \n\t"  /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,         %[p2],      %[filter12]    \n\t"  /* odd 2 */
          "dpa.w.ph         $ac1,         %[p3],      %[filter34]    \n\t"  /* odd 2 */
          "dpa.w.ph         $ac1,         %[p4],      %[filter56]    \n\t"  /* odd 2 */
          "dpa.w.ph         $ac1,         %[p1],      %[filter78]    \n\t"  /* odd 2 */
          "lbu              %[qload3],    3(%[dst])                  \n\t"  /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,       31             \n\t"  /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 4 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st3],       %[st3],     %[st1]         \n\t"  /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "dpa.w.ph         $ac2,         %[p3],      %[filter12]    \n\t"  /* odd 3 */
          "dpa.w.ph         $ac2,         %[p4],      %[filter34]    \n\t"  /* odd 3 */
          "dpa.w.ph         $ac2,         %[p1],      %[filter56]    \n\t"  /* odd 3 */
          "dpa.w.ph         $ac2,         %[p5],      %[filter78]    \n\t"  /* odd 3 */
          "sb               %[st3],       1(%[dst])                  \n\t"  /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,       31             \n\t"  /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 5 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st1]         \n\t"  /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    3(%[dst])                  \n\t"  /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])                  \n\t"  /* load odd 3 from dst */
          "ulw              %[qload2],    17(%[src])                 \n\t"
          "dpa.w.ph         $ac3,         %[p4],      %[filter12]    \n\t"  /* odd 4 */
          "dpa.w.ph         $ac3,         %[p1],      %[filter34]    \n\t"  /* odd 4 */
          "dpa.w.ph         $ac3,         %[p5],      %[filter56]    \n\t"  /* odd 4 */
          "dpa.w.ph         $ac3,         %[p2],      %[filter78]    \n\t"  /* odd 4 */
          "extp             %[Temp3],     $ac3,       31             \n\t"  /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* odd 3 */

          "lbu              %[st1],       7(%[dst])                  \n\t"  /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t"  /* odd 6 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st2]         \n\t"  /* average odd 3 */
          "preceu.ph.qbr    %[p4],        %[qload2]                  \n\t"
          "sb               %[qload1],    5(%[dst])                  \n\t"  /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],      %[filter12]    \n\t"  /* odd 5 */
          "dpa.w.ph         $ac1,         %[p5],      %[filter34]    \n\t"  /* odd 5 */
          "dpa.w.ph         $ac1,         %[p2],      %[filter56]    \n\t"  /* odd 5 */
          "dpa.w.ph         $ac1,         %[p3],      %[filter78]    \n\t"  /* odd 5 */
          "extp             %[Temp1],     $ac1,       31             \n\t"  /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 4 */

          "lbu              %[qload1],    9(%[dst])                  \n\t"  /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t"  /* odd 7 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st1],       %[st1],     %[st3]         \n\t"  /* average odd 4 */
          "preceu.ph.qbl    %[p1],        %[qload2]                  \n\t"
          "sb               %[st1],       7(%[dst])                  \n\t"  /* store odd 4 to dst */
          "ulw              %[qload3],    21(%[src])                 \n\t"
          "dpa.w.ph         $ac2,         %[p5],      %[filter12]    \n\t"  /* odd 6 */
          "dpa.w.ph         $ac2,         %[p2],      %[filter34]    \n\t"  /* odd 6 */
          "dpa.w.ph         $ac2,         %[p3],      %[filter56]    \n\t"  /* odd 6 */
          "dpa.w.ph         $ac2,         %[p4],      %[filter78]    \n\t"  /* odd 6 */
          "extp             %[Temp2],     $ac2,       31             \n\t"  /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t"  /* odd 8 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t"  /* average odd 5 */
          "preceu.ph.qbr    %[p5],        %[qload3]                  \n\t"
          "sb               %[qload1],    9(%[dst])                  \n\t"  /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])                 \n\t"  /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],      %[filter12]    \n\t"  /* odd 7 */
          "dpa.w.ph         $ac3,         %[p3],      %[filter34]    \n\t"  /* odd 7 */
          "dpa.w.ph         $ac3,         %[p4],      %[filter56]    \n\t"  /* odd 7 */
          "dpa.w.ph         $ac3,         %[p1],      %[filter78]    \n\t"  /* odd 7 */
          "extp             %[Temp3],     $ac3,       31             \n\t"  /* odd 7 */

          "lbu              %[qload3],    13(%[dst])                 \n\t"  /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],      %[filter12]    \n\t"  /* odd 8 */
          "dpa.w.ph         $ac1,         %[p4],      %[filter34]    \n\t"  /* odd 8 */
          "dpa.w.ph         $ac1,         %[p1],      %[filter56]    \n\t"  /* odd 8 */
          "dpa.w.ph         $ac1,         %[p5],      %[filter78]    \n\t"  /* odd 8 */
          "extp             %[Temp1],     $ac1,       31             \n\t"  /* odd 8 */

          "lbu              %[qload1],    15(%[dst])                 \n\t"  /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])            \n\t"  /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t"  /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])            \n\t"  /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t"  /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])            \n\t"  /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t"  /* average odd 8 */

          "sb               %[qload2],    11(%[dst])                 \n\t"  /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])                 \n\t"  /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                 \n\t"  /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row...
*/ 637 src_ptr += src_stride; 638 dst_ptr += dst_stride; 639 } 640} 641 642static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, 643 int32_t src_stride, 644 uint8_t *dst_ptr, 645 int32_t dst_stride, 646 const int16_t *filter_x0, 647 int32_t h) { 648 int32_t y, c; 649 const uint8_t *src; 650 uint8_t *dst; 651 uint8_t *cm = vpx_ff_cropTbl; 652 uint32_t vector_64 = 64; 653 int32_t filter12, filter34, filter56, filter78; 654 int32_t Temp1, Temp2, Temp3; 655 uint32_t qload1, qload2, qload3; 656 uint32_t p1, p2, p3, p4, p5; 657 uint32_t st1, st2, st3; 658 659 filter12 = ((const int32_t *)filter_x0)[0]; 660 filter34 = ((const int32_t *)filter_x0)[1]; 661 filter56 = ((const int32_t *)filter_x0)[2]; 662 filter78 = ((const int32_t *)filter_x0)[3]; 663 664 for (y = h; y--;) { 665 src = src_ptr; 666 dst = dst_ptr; 667 668 /* prefetch data to cache memory */ 669 prefetch_load(src_ptr + src_stride); 670 prefetch_load(src_ptr + src_stride + 32); 671 prefetch_load(src_ptr + src_stride + 64); 672 prefetch_store(dst_ptr + dst_stride); 673 prefetch_store(dst_ptr + dst_stride + 32); 674 675 for (c = 0; c < 4; c++) { 676 __asm__ __volatile__ ( 677 "ulw %[qload1], 0(%[src]) \n\t" 678 "ulw %[qload2], 4(%[src]) \n\t" 679 680 /* even 1. pixel */ 681 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */ 682 "mthi $zero, $ac1 \n\t" 683 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */ 684 "mthi $zero, $ac2 \n\t" 685 "preceu.ph.qbr %[p1], %[qload1] \n\t" 686 "preceu.ph.qbl %[p2], %[qload1] \n\t" 687 "preceu.ph.qbr %[p3], %[qload2] \n\t" 688 "preceu.ph.qbl %[p4], %[qload2] \n\t" 689 "ulw %[qload3], 8(%[src]) \n\t" 690 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */ 691 "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */ 692 "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */ 693 "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */ 694 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */ 695 "lbu %[st2], 0(%[dst]) \n\t" /* load even 1 from dst */ 696 697 /* even 2. 
pixel */ 698 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */ 699 "mthi $zero, $ac3 \n\t" 700 "preceu.ph.qbr %[p1], %[qload3] \n\t" 701 "preceu.ph.qbl %[p5], %[qload3] \n\t" 702 "ulw %[qload1], 12(%[src]) \n\t" 703 "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */ 704 "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */ 705 "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */ 706 "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */ 707 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */ 708 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */ 709 710 "lbu %[qload3], 2(%[dst]) \n\t" /* load even 2 from dst */ 711 712 /* even 3. pixel */ 713 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */ 714 "mthi $zero, $ac1 \n\t" 715 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 1 */ 716 "preceu.ph.qbr %[p2], %[qload1] \n\t" 717 "sb %[st2], 0(%[dst]) \n\t" /* store even 1 to dst */ 718 "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */ 719 "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */ 720 "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */ 721 "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */ 722 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */ 723 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */ 724 725 /* even 4. 
pixel */ 726 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */ 727 "mthi $zero, $ac2 \n\t" 728 "addqh_r.w %[qload3], %[qload3], %[st2] \n\t" /* average even 2 */ 729 "preceu.ph.qbl %[p3], %[qload1] \n\t" 730 "sb %[qload3], 2(%[dst]) \n\t" /* store even 2 to dst */ 731 "ulw %[qload2], 16(%[src]) \n\t" 732 "lbu %[qload3], 4(%[dst]) \n\t" /* load even 3 from dst */ 733 "lbu %[qload1], 6(%[dst]) \n\t" /* load even 4 from dst */ 734 "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */ 735 "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */ 736 "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */ 737 "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */ 738 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */ 739 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */ 740 741 /* even 5. pixel */ 742 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */ 743 "mthi $zero, $ac3 \n\t" 744 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 3 */ 745 "preceu.ph.qbr %[p4], %[qload2] \n\t" 746 "sb %[qload3], 4(%[dst]) \n\t" /* store even 3 to dst */ 747 "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */ 748 "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */ 749 "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */ 750 "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */ 751 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */ 752 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */ 753 754 /* even 6. 
pixel */ 755 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */ 756 "mthi $zero, $ac1 \n\t" 757 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average even 4 */ 758 "preceu.ph.qbl %[p1], %[qload2] \n\t" 759 "sb %[qload1], 6(%[dst]) \n\t" /* store even 4 to dst */ 760 "ulw %[qload3], 20(%[src]) \n\t" 761 "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */ 762 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */ 763 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */ 764 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */ 765 "lbu %[qload2], 8(%[dst]) \n\t" /* load even 5 from dst */ 766 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */ 767 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */ 768 769 /* even 7. pixel */ 770 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */ 771 "mthi $zero, $ac2 \n\t" 772 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 5 */ 773 "preceu.ph.qbr %[p5], %[qload3] \n\t" 774 "sb %[qload2], 8(%[dst]) \n\t" /* store even 5 to dst */ 775 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */ 776 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */ 777 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */ 778 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */ 779 "lbu %[qload3], 10(%[dst]) \n\t" /* load even 6 from dst */ 780 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */ 781 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */ 782 783 "lbu %[st2], 12(%[dst]) \n\t" /* load even 7 from dst */ 784 785 /* even 8. 
pixel */ 786 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */ 787 "mthi $zero, $ac3 \n\t" 788 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average even 6 */ 789 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */ 790 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */ 791 "sb %[qload3], 10(%[dst]) \n\t" /* store even 6 to dst */ 792 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */ 793 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */ 794 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */ 795 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */ 796 797 /* ODD pixels */ 798 "ulw %[qload1], 1(%[src]) \n\t" 799 "ulw %[qload2], 5(%[src]) \n\t" 800 801 "addqh_r.w %[st2], %[st2], %[st1] \n\t" /* average even 7 */ 802 803 /* odd 1. pixel */ 804 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */ 805 "mthi $zero, $ac1 \n\t" 806 "preceu.ph.qbr %[p1], %[qload1] \n\t" 807 "preceu.ph.qbl %[p2], %[qload1] \n\t" 808 "preceu.ph.qbr %[p3], %[qload2] \n\t" 809 "preceu.ph.qbl %[p4], %[qload2] \n\t" 810 "sb %[st2], 12(%[dst]) \n\t" /* store even 7 to dst */ 811 "ulw %[qload3], 9(%[src]) \n\t" 812 "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */ 813 "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */ 814 "lbu %[qload2], 14(%[dst]) \n\t" /* load even 8 from dst */ 815 "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */ 816 "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */ 817 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */ 818 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */ 819 820 "lbu %[st1], 1(%[dst]) \n\t" /* load odd 1 from dst */ 821 822 /* odd 2. 
pixel */ 823 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */ 824 "mthi $zero, $ac2 \n\t" 825 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average even 8 */ 826 "preceu.ph.qbr %[p1], %[qload3] \n\t" 827 "preceu.ph.qbl %[p5], %[qload3] \n\t" 828 "sb %[qload2], 14(%[dst]) \n\t" /* store even 8 to dst */ 829 "ulw %[qload1], 13(%[src]) \n\t" 830 "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */ 831 "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */ 832 "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */ 833 "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */ 834 "lbu %[qload3], 3(%[dst]) \n\t" /* load odd 2 from dst */ 835 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */ 836 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */ 837 838 /* odd 3. pixel */ 839 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */ 840 "mthi $zero, $ac3 \n\t" 841 "addqh_r.w %[st3], %[st3], %[st1] \n\t" /* average odd 1 */ 842 "preceu.ph.qbr %[p2], %[qload1] \n\t" 843 "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */ 844 "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */ 845 "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */ 846 "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */ 847 "sb %[st3], 1(%[dst]) \n\t" /* store odd 1 to dst */ 848 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */ 849 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */ 850 851 /* odd 4. 
pixel */ 852 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */ 853 "mthi $zero, $ac1 \n\t" 854 "addqh_r.w %[qload3], %[qload3], %[st1] \n\t" /* average odd 2 */ 855 "preceu.ph.qbl %[p3], %[qload1] \n\t" 856 "sb %[qload3], 3(%[dst]) \n\t" /* store odd 2 to dst */ 857 "lbu %[qload1], 5(%[dst]) \n\t" /* load odd 3 from dst */ 858 "ulw %[qload2], 17(%[src]) \n\t" 859 "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */ 860 "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */ 861 "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */ 862 "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */ 863 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */ 864 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */ 865 866 "lbu %[st1], 7(%[dst]) \n\t" /* load odd 4 from dst */ 867 868 /* odd 5. pixel */ 869 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */ 870 "mthi $zero, $ac2 \n\t" 871 "addqh_r.w %[qload1], %[qload1], %[st2] \n\t" /* average odd 3 */ 872 "preceu.ph.qbr %[p4], %[qload2] \n\t" 873 "sb %[qload1], 5(%[dst]) \n\t" /* store odd 3 to dst */ 874 "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */ 875 "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */ 876 "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */ 877 "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */ 878 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */ 879 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */ 880 881 "lbu %[qload1], 9(%[dst]) \n\t" /* load odd 5 from dst */ 882 883 /* odd 6. 
pixel */ 884 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */ 885 "mthi $zero, $ac3 \n\t" 886 "addqh_r.w %[st1], %[st1], %[st3] \n\t" /* average odd 4 */ 887 "preceu.ph.qbl %[p1], %[qload2] \n\t" 888 "sb %[st1], 7(%[dst]) \n\t" /* store odd 4 to dst */ 889 "ulw %[qload3], 21(%[src]) \n\t" 890 "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */ 891 "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */ 892 "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */ 893 "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */ 894 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */ 895 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */ 896 897 /* odd 7. pixel */ 898 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */ 899 "mthi $zero, $ac1 \n\t" 900 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 5 */ 901 "preceu.ph.qbr %[p5], %[qload3] \n\t" 902 "sb %[qload1], 9(%[dst]) \n\t" /* store odd 5 to dst */ 903 "lbu %[qload2], 11(%[dst]) \n\t" /* load odd 6 from dst */ 904 "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */ 905 "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */ 906 "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */ 907 "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */ 908 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */ 909 910 "lbu %[qload3], 13(%[dst]) \n\t" /* load odd 7 from dst */ 911 912 /* odd 8. 
pixel */ 913 "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */ 914 "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */ 915 "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */ 916 "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */ 917 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */ 918 919 "lbu %[qload1], 15(%[dst]) \n\t" /* load odd 8 from dst */ 920 921 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */ 922 "addqh_r.w %[qload2], %[qload2], %[st2] \n\t" /* average odd 6 */ 923 924 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */ 925 "addqh_r.w %[qload3], %[qload3], %[st3] \n\t" /* average odd 7 */ 926 927 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */ 928 "addqh_r.w %[qload1], %[qload1], %[st1] \n\t" /* average odd 8 */ 929 930 "sb %[qload2], 11(%[dst]) \n\t" /* store odd 6 to dst */ 931 "sb %[qload3], 13(%[dst]) \n\t" /* store odd 7 to dst */ 932 "sb %[qload1], 15(%[dst]) \n\t" /* store odd 8 to dst */ 933 934 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), 935 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3), 936 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4), 937 [qload3] "=&r" (qload3), [p5] "=&r" (p5), 938 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3) 939 : [filter12] "r" (filter12), [filter34] "r" (filter34), 940 [filter56] "r" (filter56), [filter78] "r" (filter78), 941 [vector_64] "r" (vector_64), 942 [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src) 943 ); 944 945 src += 16; 946 dst += 16; 947 } 948 949 /* Next row... 
*/
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

/* Horizontal 8-tap filter + average with dst, dispatching to a DSPr2
 * implementation specialized for the block width w.
 *
 * Interface matches the vpx convolve prototype: filters src (stride
 * src_stride) horizontally with filter_x and averages the result into
 * dst (stride dst_stride) over a w x h block.  Only the unscaled case
 * is supported (x_step_q4 must be 16); filter_y/y_step_q4 are passed
 * through untouched for the fallback paths.
 */
void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  /* Only unscaled horizontal filtering is implemented here. */
  assert(x_step_q4 == 16);
  /* Reads taps 2 and 3 of filter_x as one packed 32-bit word (same
   * aliasing pattern the DSPr2 kernels use to load tap pairs).
   * NOTE(review): 0x800000 presumably encodes the identity filter
   * (tap value 128 in the second pair), which these kernels do not
   * handle — confirm against the filter table definitions. */
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  if (((const int32_t *)filter_x)[0] == 0) {
    /* First pair of taps is zero: the filter is effectively 2-tap
     * (bilinear), so use the cheaper 2-tap avg kernel. */
    vpx_convolve2_avg_horiz_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
  } else {
    uint32_t pos = 38;

    /* Center the 8-tap window: kernels read from src - 3 .. src + 4. */
    src -= 3;

    /* bit position for extract from acc */
    /* Programs the DSPControl extract position consumed by the "extp"
     * instructions in the kernels below (38 presumably = 31-bit extract
     * size + the 7-bit filter shift — TODO confirm). This wrdsp must
     * execute before any of the convolve_avg_horiz_* calls. */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );

    /* prefetch data to cache memory */
    prefetch_load(src);
    prefetch_load(src + 32);
    prefetch_store(dst);

    /* Dispatch on block width; the 16-wide kernel is reused for w == 32
     * by doubling its horizontal iteration count. */
    switch (w) {
      case 4:
        convolve_avg_horiz_4_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, h);
        break;
      case 8:
        convolve_avg_horiz_8_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, h);
        break;
      case 16:
        convolve_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 1);
        break;
      case 32:
        convolve_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 2);
        break;
      case 64:
        prefetch_load(src + 64);
        prefetch_store(dst + 32);

        convolve_avg_horiz_64_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h);
        break;
      default:
        /* Unsupported width: undo the src adjustment and fall back to
         * the portable C implementation. */
        vpx_convolve8_avg_horiz_c(src + 3, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
        break;
    }
  }
}
#endif