/* loopfilter_filters_dspr2.c — revision 7ce0a1d1337c01056ba24006efab21f00e179e04 */
1/* 2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include <stdlib.h> 12 13#include "./vpx_dsp_rtcd.h" 14#include "vpx/vpx_integer.h" 15#include "vpx_dsp/mips/common_dspr2.h" 16#include "vpx_dsp/mips/loopfilter_filters_dspr2.h" 17#include "vpx_dsp/mips/loopfilter_macros_dspr2.h" 18#include "vpx_dsp/mips/loopfilter_masks_dspr2.h" 19#include "vpx_mem/vpx_mem.h" 20 21#if HAVE_DSPR2 22void vpx_lpf_horizontal_4_dspr2(unsigned char *s, 23 int pitch, 24 const uint8_t *blimit, 25 const uint8_t *limit, 26 const uint8_t *thresh, 27 int count) { 28 uint8_t i; 29 uint32_t mask; 30 uint32_t hev; 31 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 32 uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; 33 uint32_t thresh_vec, flimit_vec, limit_vec; 34 uint32_t uflimit, ulimit, uthresh; 35 36 uflimit = *blimit; 37 ulimit = *limit; 38 uthresh = *thresh; 39 40 /* create quad-byte */ 41 __asm__ __volatile__ ( 42 "replv.qb %[thresh_vec], %[uthresh] \n\t" 43 "replv.qb %[flimit_vec], %[uflimit] \n\t" 44 "replv.qb %[limit_vec], %[ulimit] \n\t" 45 46 : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), 47 [limit_vec] "=r" (limit_vec) 48 : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) 49 ); 50 51 /* prefetch data for store */ 52 prefetch_store(s); 53 54 /* loop filter designed to work using chars so that we can make maximum use 55 of 8 bit simd instructions. 
*/ 56 for (i = 0; i < 2; i++) { 57 sm1 = s - (pitch << 2); 58 s0 = sm1 + pitch; 59 s1 = s0 + pitch; 60 s2 = s - pitch; 61 s3 = s; 62 s4 = s + pitch; 63 s5 = s4 + pitch; 64 s6 = s5 + pitch; 65 66 __asm__ __volatile__ ( 67 "lw %[p1], (%[s1]) \n\t" 68 "lw %[p2], (%[s2]) \n\t" 69 "lw %[p3], (%[s3]) \n\t" 70 "lw %[p4], (%[s4]) \n\t" 71 72 : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4) 73 : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) 74 ); 75 76 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 77 mask will be zero and filtering is not needed */ 78 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 79 __asm__ __volatile__ ( 80 "lw %[pm1], (%[sm1]) \n\t" 81 "lw %[p0], (%[s0]) \n\t" 82 "lw %[p5], (%[s5]) \n\t" 83 "lw %[p6], (%[s6]) \n\t" 84 85 : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5), 86 [p6] "=&r" (p6) 87 : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6) 88 ); 89 90 filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, 91 pm1, p0, p3, p4, p5, p6, 92 thresh_vec, &hev, &mask); 93 94 /* if mask == 0 do filtering is not needed */ 95 if (mask) { 96 /* filtering */ 97 filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); 98 99 __asm__ __volatile__ ( 100 "sw %[p1], (%[s1]) \n\t" 101 "sw %[p2], (%[s2]) \n\t" 102 "sw %[p3], (%[s3]) \n\t" 103 "sw %[p4], (%[s4]) \n\t" 104 105 : 106 : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4), 107 [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4) 108 ); 109 } 110 } 111 112 s = s + 4; 113 } 114} 115 116void vpx_lpf_vertical_4_dspr2(unsigned char *s, 117 int pitch, 118 const uint8_t *blimit, 119 const uint8_t *limit, 120 const uint8_t *thresh, 121 int count) { 122 uint8_t i; 123 uint32_t mask, hev; 124 uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; 125 uint8_t *s1, *s2, *s3, *s4; 126 uint32_t prim1, prim2, sec3, sec4, prim3, prim4; 127 uint32_t thresh_vec, flimit_vec, limit_vec; 128 uint32_t uflimit, ulimit, uthresh; 129 130 uflimit = *blimit; 131 ulimit = *limit; 132 uthresh = *thresh; 133 
134 /* create quad-byte */ 135 __asm__ __volatile__ ( 136 "replv.qb %[thresh_vec], %[uthresh] \n\t" 137 "replv.qb %[flimit_vec], %[uflimit] \n\t" 138 "replv.qb %[limit_vec], %[ulimit] \n\t" 139 140 : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec), 141 [limit_vec] "=r" (limit_vec) 142 : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit) 143 ); 144 145 /* prefetch data for store */ 146 prefetch_store(s + pitch); 147 148 for (i = 0; i < 2; i++) { 149 s1 = s; 150 s2 = s + pitch; 151 s3 = s2 + pitch; 152 s4 = s3 + pitch; 153 s = s4 + pitch; 154 155 /* load quad-byte vectors 156 * memory is 4 byte aligned 157 */ 158 p2 = *((uint32_t *)(s1 - 4)); 159 p6 = *((uint32_t *)(s1)); 160 p1 = *((uint32_t *)(s2 - 4)); 161 p5 = *((uint32_t *)(s2)); 162 p0 = *((uint32_t *)(s3 - 4)); 163 p4 = *((uint32_t *)(s3)); 164 pm1 = *((uint32_t *)(s4 - 4)); 165 p3 = *((uint32_t *)(s4)); 166 167 /* transpose pm1, p0, p1, p2 */ 168 __asm__ __volatile__ ( 169 "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t" 170 "precr.qb.ph %[prim2], %[p2], %[p1] \n\t" 171 "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t" 172 "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t" 173 174 "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t" 175 "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t" 176 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 177 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 178 179 "precrq.ph.w %[p2], %[p1], %[sec3] \n\t" 180 "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t" 181 "append %[p1], %[sec3], 16 \n\t" 182 "append %[pm1], %[sec4], 16 \n\t" 183 184 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), 185 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), 186 [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1), 187 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) 188 : 189 ); 190 191 /* transpose p3, p4, p5, p6 */ 192 __asm__ __volatile__ ( 193 "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t" 194 "precr.qb.ph %[prim2], %[p6], %[p5] \n\t" 195 "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t" 196 "precr.qb.ph 
%[prim4], %[p4], %[p3] \n\t" 197 198 "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t" 199 "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t" 200 "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t" 201 "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t" 202 203 "precrq.ph.w %[p6], %[p5], %[sec3] \n\t" 204 "precrq.ph.w %[p4], %[p3], %[sec4] \n\t" 205 "append %[p5], %[sec3], 16 \n\t" 206 "append %[p3], %[sec4], 16 \n\t" 207 208 : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2), 209 [prim3] "=&r" (prim3), [prim4] "=&r" (prim4), 210 [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3), 211 [sec3] "=&r" (sec3), [sec4] "=&r" (sec4) 212 : 213 ); 214 215 /* if (p1 - p4 == 0) and (p2 - p3 == 0) 216 * mask will be zero and filtering is not needed 217 */ 218 if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { 219 filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, 220 p0, p3, p4, p5, p6, thresh_vec, 221 &hev, &mask); 222 223 /* if mask == 0 do filtering is not needed */ 224 if (mask) { 225 /* filtering */ 226 filter_dspr2(mask, hev, &p1, &p2, &p3, &p4); 227 228 /* unpack processed 4x4 neighborhood 229 * don't use transpose on output data 230 * because memory isn't aligned 231 */ 232 __asm__ __volatile__ ( 233 "sb %[p4], 1(%[s4]) \n\t" 234 "sb %[p3], 0(%[s4]) \n\t" 235 "sb %[p2], -1(%[s4]) \n\t" 236 "sb %[p1], -2(%[s4]) \n\t" 237 238 : 239 : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), 240 [s4] "r" (s4) 241 ); 242 243 __asm__ __volatile__ ( 244 "srl %[p4], %[p4], 8 \n\t" 245 "srl %[p3], %[p3], 8 \n\t" 246 "srl %[p2], %[p2], 8 \n\t" 247 "srl %[p1], %[p1], 8 \n\t" 248 249 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) 250 : 251 ); 252 253 __asm__ __volatile__ ( 254 "sb %[p4], 1(%[s3]) \n\t" 255 "sb %[p3], 0(%[s3]) \n\t" 256 "sb %[p2], -1(%[s3]) \n\t" 257 "sb %[p1], -2(%[s3]) \n\t" 258 259 : [p1] "+r" (p1) 260 : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3) 261 ); 262 263 __asm__ __volatile__ ( 264 "srl %[p4], %[p4], 8 \n\t" 265 "srl %[p3], 
%[p3], 8 \n\t" 266 "srl %[p2], %[p2], 8 \n\t" 267 "srl %[p1], %[p1], 8 \n\t" 268 269 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) 270 : 271 ); 272 273 __asm__ __volatile__ ( 274 "sb %[p4], 1(%[s2]) \n\t" 275 "sb %[p3], 0(%[s2]) \n\t" 276 "sb %[p2], -1(%[s2]) \n\t" 277 "sb %[p1], -2(%[s2]) \n\t" 278 279 : 280 : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), 281 [s2] "r" (s2) 282 ); 283 284 __asm__ __volatile__ ( 285 "srl %[p4], %[p4], 8 \n\t" 286 "srl %[p3], %[p3], 8 \n\t" 287 "srl %[p2], %[p2], 8 \n\t" 288 "srl %[p1], %[p1], 8 \n\t" 289 290 : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1) 291 : 292 ); 293 294 __asm__ __volatile__ ( 295 "sb %[p4], 1(%[s1]) \n\t" 296 "sb %[p3], 0(%[s1]) \n\t" 297 "sb %[p2], -1(%[s1]) \n\t" 298 "sb %[p1], -2(%[s1]) \n\t" 299 300 : 301 : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), 302 [s1] "r" (s1) 303 ); 304 } 305 } 306 } 307} 308 309void vpx_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */, 310 const uint8_t *blimit0, 311 const uint8_t *limit0, 312 const uint8_t *thresh0, 313 const uint8_t *blimit1, 314 const uint8_t *limit1, 315 const uint8_t *thresh1) { 316 vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1); 317 vpx_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); 318} 319 320void vpx_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */, 321 const uint8_t *blimit0, 322 const uint8_t *limit0, 323 const uint8_t *thresh0, 324 const uint8_t *blimit1, 325 const uint8_t *limit1, 326 const uint8_t *thresh1) { 327 vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1); 328 vpx_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1); 329} 330 331void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, 332 const uint8_t *blimit0, 333 const uint8_t *limit0, 334 const uint8_t *thresh0, 335 const uint8_t *blimit1, 336 const uint8_t *limit1, 337 const uint8_t *thresh1) { 338 vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 
1); 339 vpx_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1); 340} 341 342void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, 343 const uint8_t *blimit0, 344 const uint8_t *limit0, 345 const uint8_t *thresh0, 346 const uint8_t *blimit1, 347 const uint8_t *limit1, 348 const uint8_t *thresh1) { 349 vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1); 350 vpx_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 351 1); 352} 353 354void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, 355 const uint8_t *blimit, 356 const uint8_t *limit, 357 const uint8_t *thresh) { 358 vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh); 359 vpx_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh); 360} 361#endif // #if HAVE_DSPR2 362