/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"

#if HAVE_DSPR2
void vp9_lpf_horizontal_4_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
                                const uint8_t *thresh,
                                int count) {
  uint8_t i;
  uint32_t mask;
  uint32_t hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte: replicate each scalar into all four bytes of a word */
  __asm__ __volatile__ (
      "replv.qb    %[thresh_vec],  %[uthresh]    \n\t"
      "replv.qb    %[flimit_vec],  %[uflimit]    \n\t"
      "replv.qb    %[limit_vec],   %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  vp9_prefetch_store(s);

  /* The loop filter is designed to work on chars so that we can make
     maximum use of the 8-bit SIMD instructions. */
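
  /* The 8-pixel-wide edge is processed as two quad-pixel halves: each
     iteration loads one word from each of the four rows above and the four
     rows below the edge, then advances s by four pixels. */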

  for (i = 0; i < 2; i++) {
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;

    __asm__ __volatile__ (
        "lw    %[p1],  (%[s1])    \n\t"
        "lw    %[p2],  (%[s2])    \n\t"
        "lw    %[p3],  (%[s3])    \n\t"
        "lw    %[p4],  (%[s4])    \n\t"

        : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0),
       the mask will be zero and filtering is not needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      __asm__ __volatile__ (
          "lw    %[pm1],  (%[sm1])    \n\t"
          "lw    %[p0],   (%[s0])     \n\t"
          "lw    %[p5],   (%[s5])     \n\t"
          "lw    %[p6],   (%[s6])     \n\t"

          : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
            [p6] "=&r" (p6)
          : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
      );

      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
                                pm1, p0, p3, p4, p5, p6,
                                thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        __asm__ __volatile__ (
            "sw    %[p1],  (%[s1])    \n\t"
            "sw    %[p2],  (%[s2])    \n\t"
            "sw    %[p3],  (%[s3])    \n\t"
            "sw    %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
              [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
        );
      }
    }

    s = s + 4;
  }
}

void vp9_lpf_vertical_4_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
                              const uint8_t *thresh,
                              int count) {
  uint8_t i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte: replicate each scalar into all four bytes of a word */
  __asm__ __volatile__ (
      "replv.qb    %[thresh_vec],  %[uthresh]    \n\t"
      "replv.qb    %[flimit_vec],  %[uflimit]    \n\t"
      "replv.qb    %[limit_vec],   %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  vp9_prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));
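
    /* The loaded rows are transposed to column vectors entirely in
       registers: the precrq.qb.ph/precr.qb.ph pairs gather the high and low
       bytes of each halfword, then precrq.ph.w and append recombine the
       halfwords, completing a 4x4 byte transpose without scratch memory. */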

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__ (
        "precrq.qb.ph    %[prim1],  %[p2],     %[p1]      \n\t"
        "precr.qb.ph     %[prim2],  %[p2],     %[p1]      \n\t"
        "precrq.qb.ph    %[prim3],  %[p0],     %[pm1]     \n\t"
        "precr.qb.ph     %[prim4],  %[p0],     %[pm1]     \n\t"

        "precrq.qb.ph    %[p1],     %[prim1],  %[prim2]   \n\t"
        "precr.qb.ph     %[pm1],    %[prim1],  %[prim2]   \n\t"
        "precrq.qb.ph    %[sec3],   %[prim3],  %[prim4]   \n\t"
        "precr.qb.ph     %[sec4],   %[prim3],  %[prim4]   \n\t"

        "precrq.ph.w     %[p2],     %[p1],     %[sec3]    \n\t"
        "precrq.ph.w     %[p0],     %[pm1],    %[sec4]    \n\t"
        "append          %[p1],     %[sec3],   16         \n\t"
        "append          %[pm1],    %[sec4],   16         \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__ (
        "precrq.qb.ph    %[prim1],  %[p6],     %[p5]      \n\t"
        "precr.qb.ph     %[prim2],  %[p6],     %[p5]      \n\t"
        "precrq.qb.ph    %[prim3],  %[p4],     %[p3]      \n\t"
        "precr.qb.ph     %[prim4],  %[p4],     %[p3]      \n\t"

        "precrq.qb.ph    %[p5],     %[prim1],  %[prim2]   \n\t"
        "precr.qb.ph     %[p3],     %[prim1],  %[prim2]   \n\t"
        "precrq.qb.ph    %[sec3],   %[prim3],  %[prim4]   \n\t"
        "precr.qb.ph     %[sec4],   %[prim3],  %[prim4]   \n\t"

        "precrq.ph.w     %[p6],     %[p5],     %[sec3]    \n\t"
        "precrq.ph.w     %[p4],     %[p3],     %[sec4]    \n\t"
        "append          %[p5],     %[sec3],   16         \n\t"
        "append          %[p3],     %[sec4],   16         \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0),
     * the mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
                                p0, p3, p4, p5, p6, thresh_vec,
                                &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack the processed 4x4 neighborhood one byte at a time;
         * don't transpose the output data because memory isn't aligned
         */
        __asm__ __volatile__ (
            "sb    %[p4],   1(%[s4])    \n\t"
            "sb    %[p3],   0(%[s4])    \n\t"
            "sb    %[p2],  -1(%[s4])    \n\t"
            "sb    %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s4] "r" (s4)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8    \n\t"
            "srl    %[p3],  %[p3],  8    \n\t"
            "srl    %[p2],  %[p2],  8    \n\t"
            "srl    %[p1],  %[p1],  8    \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb    %[p4],   1(%[s3])    \n\t"
            "sb    %[p3],   0(%[s3])    \n\t"
            "sb    %[p2],  -1(%[s3])    \n\t"
            "sb    %[p1],  -2(%[s3])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s3] "r" (s3)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8    \n\t"
            "srl    %[p3],  %[p3],  8    \n\t"
            "srl    %[p2],  %[p2],  8    \n\t"
            "srl    %[p1],  %[p1],  8    \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb    %[p4],   1(%[s2])    \n\t"
            "sb    %[p3],   0(%[s2])    \n\t"
            "sb    %[p2],  -1(%[s2])    \n\t"
            "sb    %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s2] "r" (s2)
        );

        __asm__ __volatile__ (
            "srl    %[p4],  %[p4],  8    \n\t"
            "srl    %[p3],  %[p3],  8    \n\t"
            "srl    %[p2],  %[p2],  8    \n\t"
            "srl    %[p1],  %[p1],  8    \n\t"

            : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
            :
        );

        __asm__ __volatile__ (
            "sb    %[p4],   1(%[s1])    \n\t"
            "sb    %[p3],   0(%[s1])    \n\t"
            "sb    %[p2],  -1(%[s1])    \n\t"
            "sb    %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
              [s1] "r" (s1)
        );
      }
    }
  }
}
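
/* The dual variants filter two adjacent 8-pixel edges by running the
   single-edge kernels twice: offset by 8 pixels along the row for the
   horizontal filters and by 8 rows (8 * p) for the vertical ones. */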

void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vp9_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vp9_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vp9_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vp9_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vp9_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vp9_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2