17ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;
27ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
37ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;
47ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;  Use of this source code is governed by a BSD-style license
57ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;  that can be found in the LICENSE file in the root of the source
67ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;  tree. An additional intellectual property rights grant can be found
77ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;  in the file PATENTS.  All contributing project authors may
87ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;  be found in the AUTHORS file in the root of the source tree.
97ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;
107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
; Entry points made visible to the linker. vpx_mbloop_filter_neon is not in
; this list.
117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    EXPORT  |vpx_lpf_horizontal_8_neon|
127bc9febe8749e98a3812a0dc4380ceae75c29450Johann    EXPORT  |vpx_lpf_horizontal_8_dual_neon|
137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    EXPORT  |vpx_lpf_vertical_8_neon|
147bc9febe8749e98a3812a0dc4380ceae75c29450Johann    EXPORT  |vpx_lpf_vertical_8_dual_neon|
; Assemble what follows as ARM (not Thumb) instructions.
157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ARM
167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
; Read-only code section; ALIGN=2 requests 2^2 = 4-byte alignment.
177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    AREA ||.text||, CODE, READONLY, ALIGN=2
187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; Currently vpx only works on 8 iterations at a time. The vp8 loop filter
207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; works on 16 iterations at a time.
217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;
227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; void vpx_lpf_horizontal_8_neon(uint8_t *s, int p,
237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;                                const uint8_t *blimit,
247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;                                const uint8_t *limit,
2568e1c830ade592be74773e249bf94e2bbfb50de7Johann;                                const uint8_t *thresh)
;
; Loads eight 8-byte rows, s - 4 * p through s + 3 * p, passes them to
; vpx_mbloop_filter_neon and writes the six result rows, s - 3 * p through
; s + 2 * p, back. Every row access carries the @64 qualifier, i.e. each row
; address is declared to be 64-bit aligned.
;
267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r0    uint8_t *s,
277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r1    int p, /* pitch */
287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r2    const uint8_t *blimit,
297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r3    const uint8_t *limit,
307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; sp    const uint8_t *thresh,
317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian|vpx_lpf_horizontal_8_neon| PROC
    ; r4 and r5 are written by vpx_mbloop_filter_neon, so they are saved here.
    ; Three registers are pushed, which moves the stack argument to [sp, #12].
327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push        {r4-r5, lr}
337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.8      {d0[]}, [r2]               ; d0 = *blimit in every lane
357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ldr         r2, [sp, #12]              ; r2 = thresh (stack argument)
367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    add         r1, r1, r1                 ; r1 = 2 * p (r2/r3 each step two rows)
377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.8      {d1[]}, [r3]               ; d1 = *limit in every lane
397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.8      {d2[]}, [r2]               ; d2 = *thresh in every lane
407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         r3, r0, r1, lsl #1         ; r3 = s - 4 * p (p3 row)
427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    add         r2, r3, r1, lsr #1         ; r2 = s - 3 * p (p2 row)
437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    ; r3 and r2 alternate, each advancing by 2 * p after its load.
447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d3}, [r3@64], r1          ; p3
457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d4}, [r2@64], r1          ; p2
467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d5}, [r3@64], r1          ; p1
477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d6}, [r2@64], r1          ; p0
487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d7}, [r3@64], r1          ; q0
497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d16}, [r2@64], r1         ; q1
507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d17}, [r3@64]             ; q2, r3 stays at s + 2 * p
517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d18}, [r2@64], r1         ; q3, r2 ends at s + 5 * p
527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    ; Rewind both pointers to the first rows that will be stored.
537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         r3, r3, r1, lsl #1         ; r3 = s - 2 * p (op1 row)
547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         r2, r2, r1, lsl #2         ; r2 = s - 3 * p (op2 row)
557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    bl          vpx_mbloop_filter_neon     ; results come back in d0-d5
577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst1.u8     {d0}, [r2@64], r1          ; store op2
597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst1.u8     {d1}, [r3@64], r1          ; store op1
607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst1.u8     {d2}, [r2@64], r1          ; store op0
617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst1.u8     {d3}, [r3@64], r1          ; store oq0
627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst1.u8     {d4}, [r2@64], r1          ; store oq1
637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst1.u8     {d5}, [r3@64], r1          ; store oq2
647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop         {r4-r5, pc}
667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ENDP        ; |vpx_lpf_horizontal_8_neon|
687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
697bc9febe8749e98a3812a0dc4380ceae75c29450Johann;void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
707bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                    int p,
717bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                    const uint8_t *blimit0,
727bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                    const uint8_t *limit0,
737bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                    const uint8_t *thresh0,
747bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                    const uint8_t *blimit1,
757bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                    const uint8_t *limit1,
767bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                    const uint8_t *thresh1)
;
; Runs vpx_lpf_horizontal_8_neon twice: first on s with the *0 parameters,
; then on s + 8 with the *1 parameters. The second run is a tail call.
;
777bc9febe8749e98a3812a0dc4380ceae75c29450Johann; r0      uint8_t *s,
787bc9febe8749e98a3812a0dc4380ceae75c29450Johann; r1      int p, /* pitch */
797bc9febe8749e98a3812a0dc4380ceae75c29450Johann; r2      const uint8_t *blimit0,
807bc9febe8749e98a3812a0dc4380ceae75c29450Johann; r3      const uint8_t *limit0,
817bc9febe8749e98a3812a0dc4380ceae75c29450Johann; sp      const uint8_t *thresh0,
827bc9febe8749e98a3812a0dc4380ceae75c29450Johann; sp + 4  const uint8_t *blimit1,
837bc9febe8749e98a3812a0dc4380ceae75c29450Johann; sp + 8  const uint8_t *limit1,
847bc9febe8749e98a3812a0dc4380ceae75c29450Johann; sp + 12 const uint8_t *thresh1,
857bc9febe8749e98a3812a0dc4380ceae75c29450Johann|vpx_lpf_horizontal_8_dual_neon| PROC
867bc9febe8749e98a3812a0dc4380ceae75c29450Johann    push        {r0-r1, lr}                ; keep s, p and the return address
877bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ldr         lr, [sp, #12]              ; lr = thresh0 ([sp] on entry)
887bc9febe8749e98a3812a0dc4380ceae75c29450Johann    push        {lr}                       ; thresh0 as the callee's stack argument
    ; r0-r3 still hold s, p, blimit0 and limit0 for the first call.
897bc9febe8749e98a3812a0dc4380ceae75c29450Johann    bl          vpx_lpf_horizontal_8_neon
907bc9febe8749e98a3812a0dc4380ceae75c29450Johann
    ; Stack now: [sp] thresh0 copy, +4 s, +8 p, +12 lr, +16 thresh0,
    ; +20 blimit1, +24 limit1, +28 thresh1.
917bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ldr         r2, [sp, #20]              ; blimit1
927bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ldr         r3, [sp, #24]              ; limit1
937bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ldr         lr, [sp, #28]              ; lr = thresh1
947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    str         lr, [sp, #16]              ; thresh1 overwrites the incoming thresh0 slot
957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    add         sp, #4                     ; drop the pushed thresh0 copy
967bc9febe8749e98a3812a0dc4380ceae75c29450Johann    pop         {r0-r1, lr}                ; restore s, p, lr; [sp] is now thresh1
977bc9febe8749e98a3812a0dc4380ceae75c29450Johann    add         r0, #8                     ; s + 8
    ; Branch without link: the callee returns directly to this function's caller.
987bc9febe8749e98a3812a0dc4380ceae75c29450Johann    b           vpx_lpf_horizontal_8_neon
997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ENDP        ; |vpx_lpf_horizontal_8_dual_neon|
1007bc9febe8749e98a3812a0dc4380ceae75c29450Johann
1017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; void vpx_lpf_vertical_8_neon(uint8_t *s,
1027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;                              int pitch,
1037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;                              const uint8_t *blimit,
1047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;                              const uint8_t *limit,
10568e1c830ade592be74773e249bf94e2bbfb50de7Johann;                              const uint8_t *thresh)
1067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;
; Loads eight 8-byte rows starting at s - 4, transposes them so that
; d3..d18 each hold one column (p3..q3), calls vpx_mbloop_filter_neon, and
; writes op2..oq0 to bytes s - 3..s and oq1..oq2 to bytes s + 1..s + 2 of
; each of the eight rows.
;
1077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r0    uint8_t *s,
1087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r1    int pitch,
1097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r2    const uint8_t *blimit,
1107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r3    const uint8_t *limit,
1117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; sp    const uint8_t *thresh,
1127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian|vpx_lpf_vertical_8_neon| PROC
    ; r4 and r5 are written by vpx_mbloop_filter_neon, so they are saved here.
    ; Three registers are pushed, which moves the stack argument to [sp, #12].
1137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    push        {r4-r5, lr}
1147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.8      {d0[]}, [r2]              ; d0 = *blimit in every lane
1167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.8      {d1[]}, [r3]              ; d1 = *limit in every lane
1177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ldr         r3, [sp, #12]             ; r3 = thresh (stack argument)
1197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         r2, r0, #4                ; r2 = s - 4, first byte of each loaded row
1207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.8      {d2[]}, [r3]              ; d2 = *thresh in every lane
1227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d3}, [r2], r1             ; rows 0-7, 8 bytes each, stepping by pitch
1247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d4}, [r2], r1
1257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d5}, [r2], r1
1267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d6}, [r2], r1
1277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d7}, [r2], r1
1287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d16}, [r2], r1
1297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d17}, [r2], r1
1307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vld1.u8     {d18}, [r2]
1317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; transpose the 8x8 byte block: afterwards d3..d18 hold columns p3..q3
1337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.32     d3, d7
1347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.32     d4, d16
1357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.32     d5, d17
1367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.32     d6, d18
1377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.16     d3, d5
1397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.16     d4, d6
1407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.16     d7, d17
1417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.16     d16, d18
1427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.8      d3, d4
1447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.8      d5, d6
1457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.8      d7, d16
1467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vtrn.8      d17, d18
1477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    sub         r2, r0, #3                 ; r2 = s - 3, where op2 of row 0 goes
1497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    add         r3, r0, #1                 ; r3 = s + 1, where oq1 of row 0 goes
1507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    bl          vpx_mbloop_filter_neon     ; results come back in d0-d5
1527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; store op2, op1, op0, oq0: lane n of d0-d3 becomes 4 consecutive bytes of row n
1547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst4.8      {d0[0], d1[0], d2[0], d3[0]}, [r2], r1
1557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst4.8      {d0[1], d1[1], d2[1], d3[1]}, [r2], r1
1567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst4.8      {d0[2], d1[2], d2[2], d3[2]}, [r2], r1
1577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst4.8      {d0[3], d1[3], d2[3], d3[3]}, [r2], r1
1587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst4.8      {d0[4], d1[4], d2[4], d3[4]}, [r2], r1
1597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst4.8      {d0[5], d1[5], d2[5], d3[5]}, [r2], r1
1607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst4.8      {d0[6], d1[6], d2[6], d3[6]}, [r2], r1
1617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst4.8      {d0[7], d1[7], d2[7], d3[7]}, [r2]
1627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; store oq1, oq2: lane n of d4-d5 becomes 2 consecutive bytes of row n
1647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst2.8      {d4[0], d5[0]}, [r3], r1
1657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst2.8      {d4[1], d5[1]}, [r3], r1
1667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst2.8      {d4[2], d5[2]}, [r3], r1
1677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst2.8      {d4[3], d5[3]}, [r3], r1
1687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst2.8      {d4[4], d5[4]}, [r3], r1
1697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst2.8      {d4[5], d5[5]}, [r3], r1
1707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst2.8      {d4[6], d5[6]}, [r3], r1
1717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vst2.8      {d4[7], d5[7]}, [r3]
1727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    pop         {r4-r5, pc}
1747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ENDP        ; |vpx_lpf_vertical_8_neon|
1757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
1767bc9febe8749e98a3812a0dc4380ceae75c29450Johann;void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
1777bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                  int pitch,
1787bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                  const uint8_t *blimit0,
1797bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                  const uint8_t *limit0,
1807bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                  const uint8_t *thresh0,
1817bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                  const uint8_t *blimit1,
1827bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                  const uint8_t *limit1,
1837bc9febe8749e98a3812a0dc4380ceae75c29450Johann;                                  const uint8_t *thresh1)
;
; Runs vpx_lpf_vertical_8_neon twice: first on s with the *0 parameters,
; then on s + 8 * pitch with the *1 parameters. The second run is a tail call.
;
1847bc9febe8749e98a3812a0dc4380ceae75c29450Johann; r0      uint8_t *s,
1857bc9febe8749e98a3812a0dc4380ceae75c29450Johann; r1      int pitch
1867bc9febe8749e98a3812a0dc4380ceae75c29450Johann; r2      const uint8_t *blimit0,
1877bc9febe8749e98a3812a0dc4380ceae75c29450Johann; r3      const uint8_t *limit0,
1887bc9febe8749e98a3812a0dc4380ceae75c29450Johann; sp      const uint8_t *thresh0,
1897bc9febe8749e98a3812a0dc4380ceae75c29450Johann; sp + 4  const uint8_t *blimit1,
1907bc9febe8749e98a3812a0dc4380ceae75c29450Johann; sp + 8  const uint8_t *limit1,
1917bc9febe8749e98a3812a0dc4380ceae75c29450Johann; sp + 12 const uint8_t *thresh1,
1927bc9febe8749e98a3812a0dc4380ceae75c29450Johann|vpx_lpf_vertical_8_dual_neon| PROC
1937bc9febe8749e98a3812a0dc4380ceae75c29450Johann    push        {r0-r1, lr}                ; keep s, pitch and the return address
1947bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ldr         lr, [sp, #12]              ; lr = thresh0 ([sp] on entry)
1957bc9febe8749e98a3812a0dc4380ceae75c29450Johann    push        {lr}                       ; thresh0 as the callee's stack argument
    ; r0-r3 still hold s, pitch, blimit0 and limit0 for the first call.
1967bc9febe8749e98a3812a0dc4380ceae75c29450Johann    bl          vpx_lpf_vertical_8_neon
1977bc9febe8749e98a3812a0dc4380ceae75c29450Johann
    ; Stack now: [sp] thresh0 copy, +4 s, +8 pitch, +12 lr, +16 thresh0,
    ; +20 blimit1, +24 limit1, +28 thresh1.
1987bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ldr         r2, [sp, #20]              ; blimit1
1997bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ldr         r3, [sp, #24]              ; limit1
2007bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ldr         lr, [sp, #28]              ; lr = thresh1
2017bc9febe8749e98a3812a0dc4380ceae75c29450Johann    str         lr, [sp, #16]              ; thresh1 overwrites the incoming thresh0 slot
2027bc9febe8749e98a3812a0dc4380ceae75c29450Johann    add         sp, #4                     ; drop the pushed thresh0 copy
2037bc9febe8749e98a3812a0dc4380ceae75c29450Johann    pop         {r0-r1, lr}                ; restore s, pitch, lr; [sp] is now thresh1
2047bc9febe8749e98a3812a0dc4380ceae75c29450Johann    add         r0, r1, lsl #3             ; s + 8 * pitch
    ; Branch without link: the callee returns directly to this function's caller.
2057bc9febe8749e98a3812a0dc4380ceae75c29450Johann    b           vpx_lpf_vertical_8_neon
2067bc9febe8749e98a3812a0dc4380ceae75c29450Johann    ENDP        ; |vpx_lpf_vertical_8_dual_neon|
2077bc9febe8749e98a3812a0dc4380ceae75c29450Johann
2087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; void vpx_mbloop_filter_neon();
2097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; This is a helper function for the loopfilters. The individual functions do the
2107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; necessary load, transpose (if necessary) and store. The function does not use
2117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; registers d8-d15.
2127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;
2137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; Inputs:
2147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; r0-r3, r12 PRESERVE
2157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d0    blimit
2167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d1    limit
2177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d2    thresh
2187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d3    p3
2197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d4    p2
2207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d5    p1
2217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d6    p0
2227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d7    q0
2237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d16   q1
2247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d17   q2
2257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d18   q3
2267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian;
2277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; Outputs:
2287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d0    op2
2297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d1    op1
2307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d2    op0
2317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d3    oq0
2327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d4    oq1
2337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian; d5    oq2
2347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian|vpx_mbloop_filter_neon| PROC
2357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; filter_mask
2367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d19, d3, d4                ; m1 = abs(p3 - p2)
2377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d20, d4, d5                ; m2 = abs(p2 - p1)
2387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d21, d5, d6                ; m3 = abs(p1 - p0)
2397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d22, d16, d7               ; m4 = abs(q1 - q0)
2407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d23, d17, d16              ; m5 = abs(q2 - q1)
2417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d24, d18, d17              ; m6 = abs(q3 - q2)
2427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; only compare the largest value to limit
2447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d19, d19, d20              ; m1 = max(m1, m2)
2457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d20, d21, d22              ; m2 = max(m3, m4)
2467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d25, d6, d4                ; m7 = abs(p0 - p2)
2487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d23, d23, d24              ; m3 = max(m5, m6)
2507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d26, d7, d17               ; m8 = abs(q0 - q2)
2527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d19, d19, d20
2547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d24, d6, d7                ; m9 = abs(p0 - q0)
2567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d27, d3, d6                ; m10 = abs(p3 - p0)
2577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d28, d18, d7               ; m11 = abs(q3 - q0)
2587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d19, d19, d23
2607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vabd.u8     d23, d5, d16               ; a = abs(p1 - q1)
2627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
2637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; abs () > limit
2657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vcge.u8     d19, d1, d19
2667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; only compare the largest value to thresh
2687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d25, d25, d26              ; m4 = max(m7, m8)
2697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d26, d27, d28              ; m5 = max(m10, m11)
2707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vshr.u8     d23, d23, #1               ; a = a / 2
2727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d25, d25, d26              ; m4 = max(m4, m5)
2747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqadd.u8    d24, d24, d23              ; a = b + a
2767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmax.u8     d20, d20, d25              ; m2 = max(m2, m4)
2787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmov.u8     d23, #1
2807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vcge.u8     d24, d0, d24               ; a > blimit
2817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vcgt.u8     d21, d21, d2               ; (abs(p1 - p0) > thresh)*-1
2837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vcge.u8     d20, d23, d20              ; flat
2857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vand        d19, d19, d24              ; mask
2877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vcgt.u8     d23, d22, d2               ; (abs(q1 - q0) > thresh)*-1
2897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vand        d20, d20, d19              ; flat & mask
2917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmov.u8     d22, #0x80
2937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vorr        d23, d21, d23              ; hev
2957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
2967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; This instruction will truncate the "flat & mask" masks down to 4 bits
2977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; each to fit into one 32 bit arm register. The values are stored in
2987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; q10.64[0].
2997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vshrn.u16   d30, q10, #4
3007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmov.u32    r4, d30[0]                 ; flat & mask 4bits
3017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    adds        r5, r4, #1                 ; Check for all 1's
3037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; If mask and flat are 1's for all vectors, then we only need to execute
3057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; the power branch for all vectors.
3067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    beq         power_branch_only
3077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    cmp         r4, #0                     ; Check for 0, set flag for later
3097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; mbfilter() function
3117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; filter() function
3127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; convert to signed
3137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d21, d7, d22               ; qs0
3147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d24, d6, d22               ; ps0
3157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d25, d5, d22               ; ps1
3167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d26, d16, d22              ; qs1
3177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmov.u8     d27, #3
3197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsub.s8     d28, d21, d24              ; ( qs0 - ps0)
3217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
3237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
3257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vand        d29, d29, d23              ; filter &= hev
3277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
3297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmov.u8     d29, #4
3317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; filter = clamp(filter + 3 * ( qs0 - ps0))
3337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqmovn.s16  d28, q15
3347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vand        d28, d28, d19              ; filter &= mask
3367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
3387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
3397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vshr.s8     d30, d30, #3               ; filter2 >>= 3
3407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vshr.s8     d29, d29, #3               ; filter1 >>= 3
3417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
3437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqsub.s8    d21, d21, d29              ; oq0 = clamp(qs0 - filter1)
3447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; outer tap adjustments: ++filter1 >> 1
3467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vrshr.s8    d29, d29, #1
3477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbic        d29, d29, d23              ; filter &= ~hev
3487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
3507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
3517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; If mask and flat are 0's for all vectors, then we only need to execute
3537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; the filter branch for all vectors.
3547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    beq         filter_branch_only
3557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; If mask and flat are mixed then we must perform both branches and
3577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; combine the data.
3587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d24, d24, d22              ; *f_op0 = u^0x80
3597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d21, d21, d22              ; *f_oq0 = u^0x80
3607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d25, d25, d22              ; *f_op1 = u^0x80
3617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d26, d26, d22              ; *f_oq1 = u^0x80
3627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; At this point we have already executed the filter branch. The filter
3647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; branch does not set op2 or oq2, so use p2 and q2. Execute the power
3657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; branch and combine the data.
3667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmov.u8     d23, #2
3677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddl.u8    q14, d6, d7                ; r_op2 = p0 + q0
3687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmlal.u8    q14, d3, d27               ; r_op2 += p3 * 3
3697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmlal.u8    q14, d4, d23               ; r_op2 += p2 * 2
3707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbif        d0, d4, d20                ; op2 |= p2 & ~(flat & mask)
3727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d5                    ; r_op2 += p1
3747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbif        d1, d25, d20               ; op1 |= f_op1 & ~(flat & mask)
3767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d30, q14, #3               ; r_op2
3787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d3                    ; r_op1 = r_op2 - p3
3807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d4                    ; r_op1 -= p2
3817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d5                    ; r_op1 += p1
3827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d16                   ; r_op1 += q1
3837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbif        d2, d24, d20               ; op0 |= f_op0 & ~(flat & mask)
3857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d31, q14, #3               ; r_op1
3877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d3                    ; r_op0 = r_op1 - p3
3897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d5                    ; r_op0 -= p1
3907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d6                    ; r_op0 += p0
3917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d17                   ; r_op0 += q2
3927ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3937ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbit        d0, d30, d20               ; op2 |= r_op2 & (flat & mask)
3947ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3957ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d23, q14, #3               ; r_op0
3967ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
3977ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d3                    ; r_oq0 = r_op0 - p3
3987ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d6                    ; r_oq0 -= p0
3997ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d7                    ; r_oq0 += q0
4007ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4017ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbit        d1, d31, d20               ; op1 |= r_op1 & (flat & mask)
4027ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4037ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d18                   ; oq0 += q3
4047ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4057ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbit        d2, d23, d20               ; op0 |= r_op0 & (flat & mask)
4067ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4077ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d22, q14, #3               ; r_oq0
4087ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4097ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d4                    ; r_oq1 = r_oq0 - p2
4107ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d7                    ; r_oq1 -= q0
4117ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d16                   ; r_oq1 += q1
4127ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4137ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbif        d3, d21, d20               ; oq0 |= f_oq0 & ~(flat & mask)
4147ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4157ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d18                   ; r_oq1 += q3
4167ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4177ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbif        d4, d26, d20               ; oq1 |= f_oq1 & ~(flat & mask)
4187ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4197ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d6, q14, #3                ; r_oq1
4207ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4217ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d5                    ; r_oq2 = r_oq1 - p1
4227ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d16                   ; r_oq2 -= q1
4237ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d17                   ; r_oq2 += q2
4247ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d18                   ; r_oq2 += q3
4257ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4267ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbif        d5, d17, d20               ; oq2 |= q2 & ~(flat & mask)
4277ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4287ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d7, q14, #3                ; r_oq2
4297ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4307ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbit        d3, d22, d20               ; oq0 |= r_oq0 & (flat & mask)
4317ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbit        d4, d6, d20                ; oq1 |= r_oq1 & (flat & mask)
4327ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vbit        d5, d7, d20                ; oq2 |= r_oq2 & (flat & mask)
4337ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4347ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    bx          lr
4357ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4367ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianpower_branch_only
    ; Flat-filter-only path: all six outputs are built from weighted sums of
    ; the eight input rows; none of the filter() results are used here.
    ; NOTE(review): the branch to this label is outside this view. Presumably
    ; it is taken when (flat & mask) is set in every lane; confirm at the
    ; branch site.
    ;
    ; Inputs (per the annotations below):
    ;   d3 = p3, d4 = p2, d5 = p1, d6 = p0, d7 = q0, d16 = q1, d17 = q2,
    ;   d18 = q3
    ; Outputs:
    ;   d0 = op2, d1 = op1, d2 = op0, d3 = oq0, d4 = oq1, d5 = oq2
    ; Scratch: d21, d27 and q14 (d28/d29) are overwritten.
    ;
    ; q14 holds a running sum widened to 16 bits. Each output is that sum
    ; narrowed by vqrshrn #3 (rounding shift right by 3 with unsigned
    ; saturation). Between outputs the sum is updated in place: two taps are
    ; subtracted and two are added, instead of recomputing the whole sum.
    ; d3, d4 and d5 are overwritten with outputs only after their last read
    ; as p3, p2 and p1.
4377ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmov.u8     d27, #3
4387ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmov.u8     d21, #2
    ; op2 sum: 3*p3 + 2*p2 + p1 + p0 + q0
4397ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddl.u8    q14, d6, d7                ; op2 = p0 + q0
4407ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmlal.u8    q14, d3, d27               ; op2 += p3 * 3
4417ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vmlal.u8    q14, d4, d21               ; op2 += p2 * 2
4427ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d5                    ; op2 += p1
4437ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d0, q14, #3                ; op2
4447ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    ; op1 sum: 2*p3 + p2 + 2*p1 + p0 + q0 + q1
4457ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d3                    ; op1 = op2 - p3
4467ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d4                    ; op1 -= p2
4477ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d5                    ; op1 += p1
4487ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d16                   ; op1 += q1
4497ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d1, q14, #3                ; op1
4507ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    ; op0 sum: p3 + p2 + p1 + 2*p0 + q0 + q1 + q2
4517ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d3                    ; op0 = op1 - p3
4527ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d5                    ; op0 -= p1
4537ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d6                    ; op0 += p0
4547ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d17                   ; op0 += q2
4557ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d2, q14, #3                ; op0
4567ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    ; oq0 sum: p2 + p1 + p0 + 2*q0 + q1 + q2 + q3
    ; This is the last read of p3 from d3; the result then replaces it.
4577ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d3                    ; oq0 = op0 - p3
4587ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d6                    ; oq0 -= p0
4597ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d7                    ; oq0 += q0
4607ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d18                   ; oq0 += q3
4617ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d3, q14, #3                ; oq0
4627ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    ; oq1 sum: p1 + p0 + q0 + 2*q1 + q2 + 2*q3
    ; This is the last read of p2 from d4; the result then replaces it.
4637ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d4                    ; oq1 = oq0 - p2
4647ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d7                    ; oq1 -= q0
4657ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d16                   ; oq1 += q1
4667ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d18                   ; oq1 += q3
4677ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d4, q14, #3                ; oq1
4687ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
    ; oq2 sum: p0 + q0 + q1 + 2*q2 + 3*q3
    ; This is the last read of p1 from d5; the result then replaces it.
4697ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d5                    ; oq2 = oq1 - p1
4707ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vsubw.u8    q14, d16                   ; oq2 -= q1
4717ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d17                   ; oq2 += q2
4727ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vaddw.u8    q14, d18                   ; oq2 += q3
4737ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vqrshrn.u16 d5, q14, #3                ; oq2
4747ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4757ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    bx          lr
4767ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4777ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanianfilter_branch_only
    ; Filter-only path, reached from the beq that tests the flags left by the
    ; earlier "cmp r4, #0". On entry the filter() results are still in signed
    ; form in d24 (op0), d21 (oq0), d25 (op1) and d26 (oq1).
    ; filter() does not change p2 or q2, so d4 (p2) and d17 (q2) are moved
    ; into the output registers as they are.
    ; Outputs: d0 = op2, d1 = op1, d2 = op0, d3 = oq0, d4 = oq1, d5 = oq2.
    ; NOTE(review): d22 is loaded before this view; the annotations imply it
    ; holds 0x80 in every lane, so the veor converts back to unsigned.
    ; Confirm at the point where d22 is set.
4787ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; TODO(fgalligan): See if we can rearrange registers so we do not need to
4797ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ; do the 2 vswp.
4807ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vswp        d0, d4                      ; op2 = p2; old d0 goes to d4, rewritten below
4817ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    vswp        d5, d17                     ; oq2 = q2; d17 now holds the old d5
4827ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d2, d24, d22                ; *op0 = u^0x80
4837ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d3, d21, d22                ; *oq0 = u^0x80
4847ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d1, d25, d22                ; *op1 = u^0x80
4857ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    veor        d4, d26, d22                ; *oq1 = u^0x80
4867ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4877ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    bx          lr
4887ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4897ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    ENDP        ; |vpx_mbloop_filter_neon|
4907ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian
4917ce0a1d1337c01056ba24006efab21f00e179e04Vignesh Venkatasubramanian    END
492