/* loopfilter_mb_dspr2.c — revision 7ce0a1d1337c01056ba24006efab21f00e179e04 */
/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/mips/common_dspr2.h"
#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
#include "vpx_mem/vpx_mem.h"

#if HAVE_DSPR2
/*
 * Horizontal "8" loop filter, MIPS DSPR2 version.
 *
 * Filters across a horizontal block edge at *s: rows p3..p0 lie above the
 * edge (s - 4*pitch .. s - pitch) and rows q0..q3 at/below it.  Each 32-bit
 * load covers 4 adjacent columns, and the loop runs twice (s += 4), so 8
 * columns are processed in total.
 *
 *  s      - pointer to the first row at/below the edge (q0 row).
 *  pitch  - byte stride between rows.
 *  blimit/limit/thresh - scalar thresholds; each is replicated into all
 *           4 bytes of a SIMD word so the mask helpers can compare 4
 *           columns at once.
 *  count  - NOTE(review): not read anywhere in this implementation.
 *
 * Per 4-column group, filter_hev_mask_flatmask4_dspr2 yields per-byte-lane
 * masks: `mask` (filter active), `hev` (high edge variance) and `flat`.
 * Three store paths follow:
 *   - flat == 0, mask != 0:      narrow filter (filter1_dspr2) for all 4
 *                                lanes; whole-word stores.
 *   - mask & flat == 0xFFFFFFFF: wide filter (mbfilter_dspr2) for all 4
 *                                lanes; whole-word stores.
 *   - mixed:                     both filters are computed and each byte
 *                                lane is stored individually, choosing the
 *                                wide or narrow result per lane.
 */
void vpx_lpf_horizontal_8_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
                                const uint8_t *thresh,
                                int count) {
  uint32_t mask;
  uint32_t hev, flat;
  uint8_t i;
  uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;
  /* Narrow-filter results, one byte per column lane. */
  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
  /* Packed pixel words: 4 columns per register. */
  uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
  /* Wide-filter working halves: _l = left 2 columns, _r = right 2 columns,
     each unpacked to 16 bits per pixel by PACK_LEFT_0TO3/PACK_RIGHT_0TO3. */
  uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
  uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte: replicate each scalar threshold into all 4 bytes */
  __asm__ __volatile__ (
      "replv.qb %[thresh_vec], %[uthresh] \n\t"
      "replv.qb %[flimit_vec], %[uflimit] \n\t"
      "replv.qb %[limit_vec], %[ulimit] \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  prefetch_store(s);

  for (i = 0; i < 2; i++) {
    /* Row pointers: sp3..sp0 above the edge, sq0..sq3 at/below it. */
    sp3 = s - (pitch << 2);
    sp2 = sp3 + pitch;
    sp1 = sp2 + pitch;
    sp0 = sp1 + pitch;
    sq0 = s;
    sq1 = s + pitch;
    sq2 = sq1 + pitch;
    sq3 = sq2 + pitch;

    /* Load 4 columns from each of the 8 rows. */
    __asm__ __volatile__ (
        "lw %[p3], (%[sp3]) \n\t"
        "lw %[p2], (%[sp2]) \n\t"
        "lw %[p1], (%[sp1]) \n\t"
        "lw %[p0], (%[sp0]) \n\t"
        "lw %[q0], (%[sq0]) \n\t"
        "lw %[q1], (%[sq1]) \n\t"
        "lw %[q2], (%[sq2]) \n\t"
        "lw %[q3], (%[sq3]) \n\t"

        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
          [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0)
        : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
          [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
    );

    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
                                    p1, p0, p3, p2, q0, q1, q2, q3,
                                    &hev, &mask, &flat);

    if ((flat == 0) && (mask != 0)) {
      /* No flat lanes: narrow filter applies to every active lane, so the
         results can be stored as whole 4-byte words. */
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      __asm__ __volatile__ (
          "sw %[p1_f0], (%[sp1]) \n\t"
          "sw %[p0_f0], (%[sp0]) \n\t"
          "sw %[q0_f0], (%[sq0]) \n\t"
          "sw %[q1_f0], (%[sq1]) \n\t"

          :
          : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
            [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
            [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1)
      );
    } else if ((mask & flat) == 0xFFFFFFFF) {
      /* Every lane is both active and flat: wide filter everywhere,
         whole-word stores after recombining the halves. */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);

      COMBINE_LEFT_RIGHT_0TO2()

      __asm__ __volatile__ (
          "sw %[p2], (%[sp2]) \n\t"
          "sw %[p1], (%[sp1]) \n\t"
          "sw %[p0], (%[sp0]) \n\t"
          "sw %[q0], (%[sq0]) \n\t"
          "sw %[q1], (%[sq1]) \n\t"
          "sw %[q2], (%[sq2]) \n\t"

          :
          : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
            [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
            [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
            [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
      );
    } else if ((flat != 0) && (mask != 0)) {
      /* Mixed lanes: compute both filters, then store byte by byte,
         picking the wide result where (mask & flat) is set in that lane
         and the narrow result where only mask is set. */
      /* filtering */
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);

      /* Lane 0 (lowest byte): stored from the low byte of the _r / _f0
         registers at offset +0. */
      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__ (
            "sb %[p2_r], (%[sp2]) \n\t"
            "sb %[p1_r], (%[sp1]) \n\t"
            "sb %[p0_r], (%[sp0]) \n\t"
            "sb %[q0_r], (%[sq0]) \n\t"
            "sb %[q1_r], (%[sq1]) \n\t"
            "sb %[q2_r], (%[sq2]) \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__ (
            "sb %[p1_f0], (%[sp1]) \n\t"
            "sb %[p0_f0], (%[sp0]) \n\t"
            "sb %[q0_f0], (%[sq0]) \n\t"
            "sb %[q1_f0], (%[sq1]) \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Advance to the next byte lane: _r holds 16 bits/pixel (shift 16),
         _f0 holds 8 bits/pixel (shift 8). */
      __asm__ __volatile__ (
          "srl %[p2_r], %[p2_r], 16 \n\t"
          "srl %[p1_r], %[p1_r], 16 \n\t"
          "srl %[p0_r], %[p0_r], 16 \n\t"
          "srl %[q0_r], %[q0_r], 16 \n\t"
          "srl %[q1_r], %[q1_r], 16 \n\t"
          "srl %[q2_r], %[q2_r], 16 \n\t"
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 1: second column of the right half, at offset +1. */
      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb %[p2_r], +1(%[sp2]) \n\t"
            "sb %[p1_r], +1(%[sp1]) \n\t"
            "sb %[p0_r], +1(%[sp0]) \n\t"
            "sb %[q0_r], +1(%[sq0]) \n\t"
            "sb %[q1_r], +1(%[sq1]) \n\t"
            "sb %[q2_r], +1(%[sq2]) \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb %[p1_f0], +1(%[sp1]) \n\t"
            "sb %[p0_f0], +1(%[sp0]) \n\t"
            "sb %[q0_f0], +1(%[sq0]) \n\t"
            "sb %[q1_f0], +1(%[sq1]) \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Advance only the narrow-filter bytes; from lane 2 on, the wide
         result is read from the _l registers instead of _r.
         NOTE(review): p2..q2 are listed as operands here but are not
         referenced by these instructions. */
      __asm__ __volatile__ (
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 2: first column of the left half, at offset +2. */
      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb %[p2_l], +2(%[sp2]) \n\t"
            "sb %[p1_l], +2(%[sp1]) \n\t"
            "sb %[p0_l], +2(%[sp0]) \n\t"
            "sb %[q0_l], +2(%[sq0]) \n\t"
            "sb %[q1_l], +2(%[sq1]) \n\t"
            "sb %[q2_l], +2(%[sq2]) \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb %[p1_f0], +2(%[sp1]) \n\t"
            "sb %[p0_f0], +2(%[sp0]) \n\t"
            "sb %[q0_f0], +2(%[sq0]) \n\t"
            "sb %[q1_f0], +2(%[sq1]) \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }

      /* Advance _l (16 bits/pixel) and _f0 (8 bits/pixel) to lane 3. */
      __asm__ __volatile__ (
          "srl %[p2_l], %[p2_l], 16 \n\t"
          "srl %[p1_l], %[p1_l], 16 \n\t"
          "srl %[p0_l], %[p0_l], 16 \n\t"
          "srl %[q0_l], %[q0_l], 16 \n\t"
          "srl %[q1_l], %[q1_l], 16 \n\t"
          "srl %[q2_l], %[q2_l], 16 \n\t"
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 3 (highest byte): offset +3. */
      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__ (
            "sb %[p2_l], +3(%[sp2]) \n\t"
            "sb %[p1_l], +3(%[sp1]) \n\t"
            "sb %[p0_l], +3(%[sp0]) \n\t"
            "sb %[q0_l], +3(%[sq0]) \n\t"
            "sb %[q1_l], +3(%[sq1]) \n\t"
            "sb %[q2_l], +3(%[sq2]) \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
        );
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__ (
            "sb %[p1_f0], +3(%[sp1]) \n\t"
            "sb %[p0_f0], +3(%[sp0]) \n\t"
            "sb %[q0_f0], +3(%[sq0]) \n\t"
            "sb %[q1_f0], +3(%[sq1]) \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [sp1] "r" (sp1), [sp0] "r" (sp0),
              [sq0] "r" (sq0), [sq1] "r" (sq1)
        );
      }
    }

    /* Move to the next group of 4 columns. */
    s = s + 4;
  }
}

/*
 * Vertical "8" loop filter, MIPS DSPR2 version.
 *
 * Filters across a vertical block edge at column *s: bytes s[-4..-1] are
 * the p3..p0 side and s[0..3] the q0..q3 side of each row.  Four rows are
 * loaded per iteration (two iterations, 8 rows total), transposed into the
 * same packed-register layout the horizontal filter uses, filtered, and
 * stored back per row with byte stores.
 *
 * Parameters match vpx_lpf_horizontal_8_dspr2; as there, `count` is not
 * read by this implementation.
 */
void vpx_lpf_vertical_8_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
                              const uint8_t *thresh,
                              int count) {
  uint8_t i;
  uint32_t mask, hev, flat;
  /* Row pointers for the 4 rows handled in one iteration. */
  uint8_t *s1, *s2, *s3, *s4;
  /* Scratch registers for the in-register 4x4 byte transposes. */
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;
  uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
  /* Narrow-filter results, one byte per row lane. */
  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
  /* Wide-filter halves, 16 bits per pixel (see horizontal version). */
  uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
  uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte: replicate each scalar threshold into all 4 bytes */
  __asm__ __volatile__ (
      "replv.qb %[thresh_vec], %[uthresh] \n\t"
      "replv.qb %[flimit_vec], %[uflimit] \n\t"
      "replv.qb %[limit_vec], %[ulimit] \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s = s4 + pitch;

    /* Load 4 bytes left of the edge (p side) and 4 at/right of it (q side)
       for each of the 4 rows; the transposes below rearrange them. */
    __asm__ __volatile__ (
        "lw %[p0], -4(%[s1]) \n\t"
        "lw %[p1], -4(%[s2]) \n\t"
        "lw %[p2], -4(%[s3]) \n\t"
        "lw %[p3], -4(%[s4]) \n\t"
        "lw %[q3], (%[s1]) \n\t"
        "lw %[q2], (%[s2]) \n\t"
        "lw %[q1], (%[s3]) \n\t"
        "lw %[q0], (%[s4]) \n\t"

        : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
          [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );

    /* transpose p3, p2, p1, p0
       original (when loaded from memory)
       register       -4    -3    -2    -1
         p0         p0_0  p0_1  p0_2  p0_3
         p1         p1_0  p1_1  p1_2  p1_3
         p2         p2_0  p2_1  p2_2  p2_3
         p3         p3_0  p3_1  p3_2  p3_3

       after transpose
       register
         p0         p3_3  p2_3  p1_3  p0_3
         p1         p3_2  p2_2  p1_2  p0_2
         p2         p3_1  p2_1  p1_1  p0_1
         p3         p3_0  p2_0  p1_0  p0_0
    */
    __asm__ __volatile__ (
        "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
        "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
        "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
        "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"

        "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
        "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
        "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
        "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"

        "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
        "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
        "append %[p1], %[sec3], 16 \n\t"
        "append %[p3], %[sec4], 16 \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* transpose q0, q1, q2, q3
       original (when loaded from memory)
       register       +1    +2    +3    +4
         q3         q3_0  q3_1  q3_2  q3_3
         q2         q2_0  q2_1  q2_2  q2_3
         q1         q1_0  q1_1  q1_2  q1_3
         q0         q0_0  q0_1  q0_2  q0_3

       after transpose
       register
         q3         q0_3  q1_3  q2_3  q3_3
         q2         q0_2  q1_2  q2_2  q3_2
         q1         q0_1  q1_1  q2_1  q3_1
         q0         q0_0  q1_0  q2_0  q3_0
    */
    __asm__ __volatile__ (
        "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
        "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
        "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
        "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"

        "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
        "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
        "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
        "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"

        "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
        "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
        "append %[q2], %[sec3], 16 \n\t"
        "append %[q0], %[sec4], 16 \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
                                    p1, p0, p3, p2, q0, q1, q2, q3,
                                    &hev, &mask, &flat);

    if ((flat == 0) && (mask != 0)) {
      /* No flat lanes: narrow filter only; STORE_F0 scatters the f0 bytes
         back to the four rows. */
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);
      STORE_F0()
    } else if ((mask & flat) == 0xFFFFFFFF) {
      /* All lanes active and flat: wide filter on both halves; STORE_F1
         writes the wide results back to the four rows. */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);

      STORE_F1()
    } else if ((flat != 0) && (mask != 0)) {
      /* Mixed lanes: compute both filters and store per row (byte lane),
         choosing wide vs. narrow per lane as in the horizontal filter.
         Lane 0 maps to row s4, lane 1 to s3, lane 2 to s2, lane 3 to s1. */
      filter1_dspr2(mask, hev, p1, p0, q0, q1,
                    &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
                     &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
                     &q0_r, &q1_r, &q2_r, &q3_r);

      /* Lane 0 -> row s4: write p2..q2 around the edge column. */
      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__ (
            "sb %[p2_r], -3(%[s4]) \n\t"
            "sb %[p1_r], -2(%[s4]) \n\t"
            "sb %[p0_r], -1(%[s4]) \n\t"
            "sb %[q0_r], (%[s4]) \n\t"
            "sb %[q1_r], +1(%[s4]) \n\t"
            "sb %[q2_r], +2(%[s4]) \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [s4] "r" (s4)
        );
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__ (
            "sb %[p1_f0], -2(%[s4]) \n\t"
            "sb %[p0_f0], -1(%[s4]) \n\t"
            "sb %[q0_f0], (%[s4]) \n\t"
            "sb %[q1_f0], +1(%[s4]) \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s4] "r" (s4)
        );
      }

      /* Advance to lane 1: _r is 16 bits/pixel, _f0 is 8 bits/pixel. */
      __asm__ __volatile__ (
          "srl %[p2_r], %[p2_r], 16 \n\t"
          "srl %[p1_r], %[p1_r], 16 \n\t"
          "srl %[p0_r], %[p0_r], 16 \n\t"
          "srl %[q0_r], %[q0_r], 16 \n\t"
          "srl %[q1_r], %[q1_r], 16 \n\t"
          "srl %[q2_r], %[q2_r], 16 \n\t"
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
            [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 1 -> row s3. */
      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb %[p2_r], -3(%[s3]) \n\t"
            "sb %[p1_r], -2(%[s3]) \n\t"
            "sb %[p0_r], -1(%[s3]) \n\t"
            "sb %[q0_r], (%[s3]) \n\t"
            "sb %[q1_r], +1(%[s3]) \n\t"
            "sb %[q2_r], +2(%[s3]) \n\t"

            :
            : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
              [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
              [s3] "r" (s3)
        );
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__ (
            "sb %[p1_f0], -2(%[s3]) \n\t"
            "sb %[p0_f0], -1(%[s3]) \n\t"
            "sb %[q0_f0], (%[s3]) \n\t"
            "sb %[q1_f0], +1(%[s3]) \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s3] "r" (s3)
        );
      }

      /* Advance only the narrow-filter bytes; lanes 2-3 read the wide
         result from the _l registers.
         NOTE(review): p2..q2 are listed as operands here but are not
         referenced by these instructions. */
      __asm__ __volatile__ (
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
            [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 2 -> row s2. */
      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb %[p2_l], -3(%[s2]) \n\t"
            "sb %[p1_l], -2(%[s2]) \n\t"
            "sb %[p0_l], -1(%[s2]) \n\t"
            "sb %[q0_l], (%[s2]) \n\t"
            "sb %[q1_l], +1(%[s2]) \n\t"
            "sb %[q2_l], +2(%[s2]) \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [s2] "r" (s2)
        );
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__ (
            "sb %[p1_f0], -2(%[s2]) \n\t"
            "sb %[p0_f0], -1(%[s2]) \n\t"
            "sb %[q0_f0], (%[s2]) \n\t"
            "sb %[q1_f0], +1(%[s2]) \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
              [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
              [s2] "r" (s2)
        );
      }

      /* Advance _l and _f0 to lane 3. */
      __asm__ __volatile__ (
          "srl %[p2_l], %[p2_l], 16 \n\t"
          "srl %[p1_l], %[p1_l], 16 \n\t"
          "srl %[p0_l], %[p0_l], 16 \n\t"
          "srl %[q0_l], %[q0_l], 16 \n\t"
          "srl %[q1_l], %[q1_l], 16 \n\t"
          "srl %[q2_l], %[q2_l], 16 \n\t"
          "srl %[p1_f0], %[p1_f0], 8 \n\t"
          "srl %[p0_f0], %[p0_f0], 8 \n\t"
          "srl %[q0_f0], %[q0_f0], 8 \n\t"
          "srl %[q1_f0], %[q1_f0], 8 \n\t"

          : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
            [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
            [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
            [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
          :
      );

      /* Lane 3 -> row s1. */
      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__ (
            "sb %[p2_l], -3(%[s1]) \n\t"
            "sb %[p1_l], -2(%[s1]) \n\t"
            "sb %[p0_l], -1(%[s1]) \n\t"
            "sb %[q0_l], (%[s1]) \n\t"
            "sb %[q1_l], +1(%[s1]) \n\t"
            "sb %[q2_l], +2(%[s1]) \n\t"

            :
            : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
              [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
              [s1] "r" (s1)
        );
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__ (
            "sb %[p1_f0], -2(%[s1]) \n\t"
            "sb %[p0_f0], -1(%[s1]) \n\t"
            "sb %[q0_f0], (%[s1]) \n\t"
            "sb %[q1_f0], +1(%[s1]) \n\t"

            :
            : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
              [q1_f0] "r" (q1_f0), [s1] "r" (s1)
        );
      }
    }
  }
}
#endif  // #if HAVE_DSPR2