/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"

#if HAVE_DSPR2
/* 4-tap loop filter applied along a horizontal boundary (DSPr2 version).
 *
 * s        pointer into the row at offset 0.  Rows s - 4 * pitch through
 *          s + 3 * pitch are read; rows s - 2 * pitch through s + pitch are
 *          rewritten when the filter mask is non-zero.
 * pitch    byte distance between vertically adjacent pixels.
 * blimit, limit, thresh
 *          each points to a single byte, which is replicated into all four
 *          byte lanes of a 32-bit word before being handed to the mask helper.
 * count    ignored by this implementation: every call processes exactly two
 *          groups of four pixels (8 pixels in total).
 *
 * Pixels are transferred with word-sized lw/sw instructions, so s is expected
 * to be 4-byte aligned -- NOTE(review): confirm against callers.
 */
void vp9_lpf_horizontal_4_dspr2(unsigned char *s,
                                int pitch,
                                const uint8_t *blimit,
                                const uint8_t *limit,
                                const uint8_t *thresh,
                                int count) {
  int       i;
  uint32_t  mask, hev;
  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t   *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t  thresh_vec, flimit_vec, limit_vec;
  const uint32_t uflimit = *blimit;
  const uint32_t ulimit = *limit;
  const uint32_t uthresh = *thresh;

  (void)count;  /* the amount of work is fixed, see the header comment */

  /* create quad-byte vectors: replicate each threshold byte into four lanes */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  vp9_prefetch_store(s);

  /* loop filter designed to work using chars so that we can make maximum use
     of 8 bit simd instructions: each pass handles four adjacent pixels. */
  for (i = 0; i < 2; i++, s += 4) {
    /* eight row pointers, from four rows before s to three rows after it */
    sm1 = s - (pitch << 2);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;

    /* load the four inner rows first; the outer rows are only needed when
       the early-out test below does not fire.  The "memory" clobber tells the
       compiler that the asm accesses memory through its pointer operands. */
    __asm__ __volatile__ (
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
        : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
        : "memory"
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
       mask will be zero and filtering is not needed */
    if (p1 == p4 && p2 == p3) {
      continue;
    }

    __asm__ __volatile__ (
        "lw       %[pm1], (%[sm1])   \n\t"
        "lw       %[p0],  (%[s0])    \n\t"
        "lw       %[p5],  (%[s5])    \n\t"
        "lw       %[p6],  (%[s6])    \n\t"

        : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
          [p6] "=&r" (p6)
        : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
        : "memory"
    );

    vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
                              pm1, p0, p3, p4, p5, p6,
                              thresh_vec, &hev, &mask);

    /* if mask == 0, filtering is not needed */
    if (mask) {
      /* filtering: updates p1..p4 in place */
      vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

      /* write the four filtered rows back */
      __asm__ __volatile__ (
          "sw     %[p1],  (%[s1])    \n\t"
          "sw     %[p2],  (%[s2])    \n\t"
          "sw     %[p3],  (%[s3])    \n\t"
          "sw     %[p4],  (%[s4])    \n\t"

          :
          : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
            [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
          : "memory"
      );
    }
  }
}

/* 4-tap loop filter applied along a vertical boundary (DSPr2 version).
 *
 * s        pointer to column 0 of the first row.  For each of 8 consecutive
 *          rows, bytes at columns -4..3 are read and bytes at columns -2..1
 *          are rewritten when the filter mask is non-zero.
 * pitch    byte distance between vertically adjacent pixels.
 * blimit, limit, thresh
 *          each points to a single byte, which is replicated into all four
 *          byte lanes of a 32-bit word before being handed to the mask helper.
 * count    ignored by this implementation: every call processes exactly two
 *          groups of four rows (8 rows in total).
 *
 * Rows are loaded as 32-bit words, transposed so that each word holds one
 * column of four pixels, filtered, and then stored back byte by byte.
 */
void vp9_lpf_vertical_4_dspr2(unsigned char *s,
                              int pitch,
                              const uint8_t *blimit,
                              const uint8_t *limit,
                              const uint8_t *thresh,
                              int count) {
  int       i;
  uint32_t  mask, hev;
  uint32_t  pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t   *s1, *s2, *s3, *s4;
  uint32_t  prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t  thresh_vec, flimit_vec, limit_vec;
  const uint32_t uflimit = *blimit;
  const uint32_t ulimit = *limit;
  const uint32_t uthresh = *thresh;

  (void)count;  /* the amount of work is fixed, see the header comment */

  /* create quad-byte vectors: replicate each threshold byte into four lanes */
  __asm__ __volatile__ (
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
        [limit_vec] "=r" (limit_vec)
      : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
  );

  /* prefetch data for store */
  vp9_prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    /* four consecutive rows; s is advanced to the start of the next group */
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s  = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     * NOTE(review): these loads go through a uint8_t* -> uint32_t* cast and
     * therefore rely on that alignment holding for every caller.
     */
    p2  = *((uint32_t *)(s1 - 4));
    p6  = *((uint32_t *)(s1));
    p1  = *((uint32_t *)(s2 - 4));
    p5  = *((uint32_t *)(s2));
    p0  = *((uint32_t *)(s3 - 4));
    p4  = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3  = *((uint32_t *)(s4));

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__ (
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
          [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
          [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
          [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
        :
    );

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (p1 == p4 && p2 == p3) {
      continue;
    }

    vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
                              p0, p3, p4, p5, p6, thresh_vec,
                              &hev, &mask);

    /* if mask == 0, filtering is not needed */
    if (mask) {
      /* filtering: updates p1..p4 in place */
      vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

      /* unpack processed 4x4 neighborhood
       * don't use transpose on output data
       * because memory isn't aligned
       *
       * The low byte of p1..p4 is written to row s4 at columns -2..1; the
       * words are then shifted right by one byte for rows s3, s2 and s1.
       * Each store block carries a "memory" clobber because it writes
       * through a pointer operand.
       */
      __asm__ __volatile__ (
          "sb     %[p4],   1(%[s4])    \n\t"
          "sb     %[p3],   0(%[s4])    \n\t"
          "sb     %[p2],  -1(%[s4])    \n\t"
          "sb     %[p1],  -2(%[s4])    \n\t"

          :
          : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
            [s4] "r" (s4)
          : "memory"
      );

      __asm__ __volatile__ (
          "srl    %[p4],  %[p4],  8     \n\t"
          "srl    %[p3],  %[p3],  8     \n\t"
          "srl    %[p2],  %[p2],  8     \n\t"
          "srl    %[p1],  %[p1],  8     \n\t"

          : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
          :
      );

      __asm__ __volatile__ (
          "sb     %[p4],   1(%[s3])    \n\t"
          "sb     %[p3],   0(%[s3])    \n\t"
          "sb     %[p2],  -1(%[s3])    \n\t"
          "sb     %[p1],  -2(%[s3])    \n\t"

          :
          : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
            [s3] "r" (s3)
          : "memory"
      );

      __asm__ __volatile__ (
          "srl    %[p4],  %[p4],  8     \n\t"
          "srl    %[p3],  %[p3],  8     \n\t"
          "srl    %[p2],  %[p2],  8     \n\t"
          "srl    %[p1],  %[p1],  8     \n\t"

          : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
          :
      );

      __asm__ __volatile__ (
          "sb     %[p4],   1(%[s2])    \n\t"
          "sb     %[p3],   0(%[s2])    \n\t"
          "sb     %[p2],  -1(%[s2])    \n\t"
          "sb     %[p1],  -2(%[s2])    \n\t"

          :
          : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
            [s2] "r" (s2)
          : "memory"
      );

      __asm__ __volatile__ (
          "srl    %[p4],  %[p4],  8     \n\t"
          "srl    %[p3],  %[p3],  8     \n\t"
          "srl    %[p2],  %[p2],  8     \n\t"
          "srl    %[p1],  %[p1],  8     \n\t"

          : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
          :
      );

      __asm__ __volatile__ (
          "sb     %[p4],   1(%[s1])    \n\t"
          "sb     %[p3],   0(%[s1])    \n\t"
          "sb     %[p2],  -1(%[s1])    \n\t"
          "sb     %[p1],  -2(%[s1])    \n\t"

          :
          : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
            [s1] "r" (s1)
          : "memory"
      );
    }
  }
}

/* Runs vp9_lpf_horizontal_4_dspr2 twice: once at s with the first threshold
 * set (blimit0/limit0/thresh0) and once at s + 8 with the second set
 * (blimit1/limit1/thresh1).  p is the pitch passed through to both calls.
 */
void vp9_lpf_horizontal_4_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vp9_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

/* Runs vp9_lpf_horizontal_8_dspr2 twice: once at s with the first threshold
 * set (blimit0/limit0/thresh0) and once at s + 8 with the second set
 * (blimit1/limit1/thresh1).  p is the pitch passed through to both calls.
 */
void vp9_lpf_horizontal_8_dual_dspr2(uint8_t *s, int p /* pitch */,
                                     const uint8_t *blimit0,
                                     const uint8_t *limit0,
                                     const uint8_t *thresh0,
                                     const uint8_t *blimit1,
                                     const uint8_t *limit1,
                                     const uint8_t *thresh1) {
  vp9_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
}

/* Runs vp9_lpf_vertical_4_dspr2 twice: once at s with the first threshold
 * set (blimit0/limit0/thresh0) and once eight rows further on, at s + 8 * p,
 * with the second set (blimit1/limit1/thresh1).  p is the pitch.
 */
void vp9_lpf_vertical_4_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vp9_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

/* Runs vp9_lpf_vertical_8_dspr2 twice: once at s with the first threshold
 * set (blimit0/limit0/thresh0) and once eight rows further on, at s + 8 * p,
 * with the second set (blimit1/limit1/thresh1).  p is the pitch.
 */
void vp9_lpf_vertical_8_dual_dspr2(uint8_t *s, int p,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  vp9_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0, 1);
  vp9_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1, 1);
}

/* Runs vp9_lpf_vertical_16_dspr2 twice with the same blimit/limit/thresh:
 * once at s and once eight rows further on, at s + 8 * p.  p is the pitch.
 */
void vp9_lpf_vertical_16_dual_dspr2(uint8_t *s, int p,
                                    const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  vp9_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vp9_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
}
#endif  // #if HAVE_DSPR2