/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
109b35249446b07f40ac5fcc3205f2c048616efacchkuang
119b35249446b07f40ac5fcc3205f2c048616efacchkuang#include <stdlib.h>
129b35249446b07f40ac5fcc3205f2c048616efacchkuang
137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "./vpx_dsp_rtcd.h"
147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx/vpx_integer.h"
157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/mips/common_dspr2.h"
167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian#include "vpx_mem/vpx_mem.h"
209b35249446b07f40ac5fcc3205f2c048616efacchkuang
219b35249446b07f40ac5fcc3205f2c048616efacchkuang#if HAVE_DSPR2
/*
 * Loop-filters an 8-pixel-wide strip whose rows are `pitch` bytes apart.
 *
 * s       first pixel of the strip; the eight rows s - 4 * pitch through
 *         s + 3 * pitch are read (a window centred on the boundary between
 *         row s - pitch and row s).  Only rows s - 2 * pitch, s - pitch, s
 *         and s + pitch are ever written.
 * pitch   byte distance between vertically adjacent pixels.
 * blimit, limit, thresh
 *         each is dereferenced once; the byte read is replicated into all
 *         four lanes of a 32-bit vector.
 *
 * Pixels are processed four at a time using whole-word lw/sw accesses, so the
 * loop runs twice to cover 8 columns.
 * NOTE(review): lw/sw presumably require s (and pitch) to keep every row
 * address 4-byte aligned -- confirm against callers.
 */
void vpx_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
                                const uint8_t *blimit, const uint8_t *limit,
                                const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask;
  uint32_t hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte: replicate each 8-bit parameter into all four lanes */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s);

  /* loop filter designed to work using chars so that we can make maximum use
     of 8 bit simd instructions. */
  for (i = 0; i < 2; i++) {
    /* Multiply instead of shifting: left-shifting a negative pitch would be
       undefined behaviour. */
    sm1 = s - (pitch * 4);
    s0 = sm1 + pitch;
    s1 = s0 + pitch;
    s2 = s - pitch;
    s3 = s;
    s4 = s + pitch;
    s5 = s4 + pitch;
    s6 = s5 + pitch;

    /* Load the four rows that may be modified.  The "memory" clobber tells
       the compiler this statement reads memory that is not listed as an
       operand. */
    __asm__ __volatile__(
        "lw     %[p1],  (%[s1])    \n\t"
        "lw     %[p2],  (%[s2])    \n\t"
        "lw     %[p3],  (%[s3])    \n\t"
        "lw     %[p4],  (%[s4])    \n\t"

        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)
        : "memory");

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
       mask will be zero and filtering is not needed */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      /* The outer rows are only needed for the mask computation, so they are
         loaded lazily. */
      __asm__ __volatile__(
          "lw       %[pm1], (%[sm1])   \n\t"
          "lw       %[p0],  (%[s0])    \n\t"
          "lw       %[p5],  (%[s5])    \n\t"
          "lw       %[p6],  (%[s6])    \n\t"

          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6)
          : "memory");

      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* Write the filtered rows back.  The "memory" clobber records that
           this statement modifies the pixel buffer. */
        __asm__ __volatile__(
            "sw     %[p1],  (%[s1])    \n\t"
            "sw     %[p2],  (%[s2])    \n\t"
            "sw     %[p3],  (%[s3])    \n\t"
            "sw     %[p4],  (%[s4])    \n\t"

            :
            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4)
            : "memory");
      }
    }

    /* advance to the next group of four columns */
    s = s + 4;
  }
}
1069b35249446b07f40ac5fcc3205f2c048616efacchkuang
/*
 * Loop-filters 8 consecutive rows around column position s.
 *
 * s       pixel at offset 0 of the first row.  For every row the four bytes
 *         before s and the four bytes starting at s are read; only the bytes
 *         at offsets -2, -1, 0 and +1 are ever written.
 * pitch   byte distance between vertically adjacent pixels.
 * blimit, limit, thresh
 *         each is dereferenced once; the byte read is replicated into all
 *         four lanes of a 32-bit vector.
 *
 * Rows are handled four at a time: eight words are loaded, transposed so that
 * each word holds one pixel column, filtered, and then scattered back byte by
 * byte.  Two passes cover the 8 rows.
 */
void vpx_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
                              const uint8_t *blimit, const uint8_t *limit,
                              const uint8_t *thresh) {
  uint8_t i;
  uint32_t mask, hev;
  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
  uint8_t *s1, *s2, *s3, *s4;
  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte: replicate each 8-bit parameter into all four lanes */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s + pitch);

  for (i = 0; i < 2; i++) {
    /* s1..s4 are the four rows of this pass; s is advanced to the row after
       them right away so the next pass starts there. */
    s1 = s;
    s2 = s + pitch;
    s3 = s2 + pitch;
    s4 = s3 + pitch;
    s = s4 + pitch;

    /* load quad-byte vectors
     * memory is 4 byte aligned
     */
    p2 = *((uint32_t *)(s1 - 4));
    p6 = *((uint32_t *)(s1));
    p1 = *((uint32_t *)(s2 - 4));
    p5 = *((uint32_t *)(s2));
    p0 = *((uint32_t *)(s3 - 4));
    p4 = *((uint32_t *)(s3));
    pm1 = *((uint32_t *)(s4 - 4));
    p3 = *((uint32_t *)(s4));

    /* transpose pm1, p0, p1, p2 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"

        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
        "append         %[p1],      %[sec3],    16          \n\t"
        "append         %[pm1],     %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* transpose p3, p4, p5, p6 */
    __asm__ __volatile__(
        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"

        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"

        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
        "append         %[p5],      %[sec3],    16          \n\t"
        "append         %[p3],      %[sec4],    16          \n\t"

        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
        :);

    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
     * mask will be zero and filtering is not needed
     */
    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
                            p6, thresh_vec, &hev, &mask);

      /* if mask == 0, filtering is not needed */
      if (mask) {
        /* filtering */
        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);

        /* unpack processed 4x4 neighborhood
         * don't use transpose on output data
         * because memory isn't aligned
         *
         * The lowest byte of p1..p4 goes to row s4; each following shift by 8
         * exposes the byte for the next row up (s3, s2, s1).  Every store
         * group carries a "memory" clobber so the compiler knows the pixel
         * buffer is modified.
         */
        __asm__ __volatile__(
            "sb     %[p4],   1(%[s4])    \n\t"
            "sb     %[p3],   0(%[s4])    \n\t"
            "sb     %[p2],  -1(%[s4])    \n\t"
            "sb     %[p1],  -2(%[s4])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s4] "r"(s4)
            : "memory");

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        /* p1 is only read here, so it is listed as an input like the other
           store groups. */
        __asm__ __volatile__(
            "sb     %[p4],   1(%[s3])    \n\t"
            "sb     %[p3],   0(%[s3])    \n\t"
            "sb     %[p2],  -1(%[s3])    \n\t"
            "sb     %[p1],  -2(%[s3])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s3] "r"(s3)
            : "memory");

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s2])    \n\t"
            "sb     %[p3],   0(%[s2])    \n\t"
            "sb     %[p2],  -1(%[s2])    \n\t"
            "sb     %[p1],  -2(%[s2])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s2] "r"(s2)
            : "memory");

        __asm__ __volatile__(
            "srl    %[p4],  %[p4],  8     \n\t"
            "srl    %[p3],  %[p3],  8     \n\t"
            "srl    %[p2],  %[p2],  8     \n\t"
            "srl    %[p1],  %[p1],  8     \n\t"

            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
            :);

        __asm__ __volatile__(
            "sb     %[p4],   1(%[s1])    \n\t"
            "sb     %[p3],   0(%[s1])    \n\t"
            "sb     %[p2],  -1(%[s1])    \n\t"
            "sb     %[p1],  -2(%[s1])    \n\t"

            :
            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
              [s1] "r"(s1)
            : "memory");
      }
    }
  }
}
283b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
/*
 * Runs vpx_lpf_horizontal_4_dspr2 twice: once at s with the first parameter
 * set (blimit0/limit0/thresh0) and once 8 bytes further along the row with
 * the second set (blimit1/limit1/thresh1).
 */
void vpx_lpf_horizontal_4_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8_t *const second_half = s + 8;

  vpx_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_4_dspr2(second_half, p, blimit1, limit1, thresh1);
}
291b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
/*
 * Runs vpx_lpf_horizontal_8_dspr2 twice: once at s with the first parameter
 * set (blimit0/limit0/thresh0) and once 8 bytes further along the row with
 * the second set (blimit1/limit1/thresh1).
 */
void vpx_lpf_horizontal_8_dual_dspr2(
    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
    const uint8_t *limit1, const uint8_t *thresh1) {
  uint8_t *const second_half = s + 8;

  vpx_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_horizontal_8_dspr2(second_half, p, blimit1, limit1, thresh1);
}
299b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
/*
 * Runs vpx_lpf_vertical_4_dspr2 twice: once at s with the first parameter
 * set (blimit0/limit0/thresh0) and once 8 rows further down (s + 8 * p) with
 * the second set (blimit1/limit1/thresh1).  p is the row pitch in bytes.
 */
void vpx_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  uint8_t *const lower_half = s + 8 * p;

  vpx_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_4_dspr2(lower_half, p, blimit1, limit1, thresh1);
}
309b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
/*
 * Runs vpx_lpf_vertical_8_dspr2 twice: once at s with the first parameter
 * set (blimit0/limit0/thresh0) and once 8 rows further down (s + 8 * p) with
 * the second set (blimit1/limit1/thresh1).  p is the row pitch in bytes.
 */
void vpx_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0,
                                   const uint8_t *blimit1,
                                   const uint8_t *limit1,
                                   const uint8_t *thresh1) {
  uint8_t *const lower_half = s + 8 * p;

  vpx_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
  vpx_lpf_vertical_8_dspr2(lower_half, p, blimit1, limit1, thresh1);
}
319b08e2e23eec181e9951df33cd704ac294c5407b6Vignesh Venkatasubramanian
/*
 * Runs vpx_lpf_vertical_16_dspr2 twice with the same blimit/limit/thresh:
 * once at s and once 8 rows further down (s + 8 * p).  p is the row pitch in
 * bytes.
 */
void vpx_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit,
                                    const uint8_t *thresh) {
  uint8_t *const lower_half = s + 8 * p;

  vpx_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
  vpx_lpf_vertical_16_dspr2(lower_half, p, blimit, limit, thresh);
}
3269b35249446b07f40ac5fcc3205f2c048616efacchkuang#endif  // #if HAVE_DSPR2
327