10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  ihevc_sao_edge_offset_class0.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  Contains the function definition for SAO edge offset class 0, which
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  compares each pixel with its left and right neighbours. The function is
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  hand-written ARM NEON assembly using GNU assembler directives.
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:author
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  Parthiban V
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:par List of Functions:
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  - ihevc_sao_edge_offset_class0_a9q()
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:remarks
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  None
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 src_strd,
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_left,
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top,
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top_left,
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top_right,
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_bot_left,
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_avail,
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD8 *pi1_sao_offset,
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 wd,
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 ht)
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@**************Variables Vs Registers*****************************************
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r0 =>  *pu1_src
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r1 =>  src_strd
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r2 =>  *pu1_src_left
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r3 =>  *pu1_src_top
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r4 =>  *pu1_src_top_left
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r7 =>  *pu1_avail
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r8 =>  *pi1_sao_offset
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r9 =>  wd
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r10=>  ht
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ Byte offsets from sp of the stack-passed C arguments, valid only after the
@ function prologue has run. The prologue pushes r4-r12 and r14 (10 words =
@ 40 bytes) and then d8-d15 (8 doublewords = 64 bytes), so the first
@ stack-passed argument sits at sp + 104 and each following 4-byte argument
@ is 4 bytes higher. Changing either push list requires updating these values.
@ NOTE(review): assumes the first four C arguments arrive in r0-r3 and the
@ rest on the stack (AAPCS) - confirm against the build's calling convention.
62b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar.equ    pu1_src_top_left_offset,    104    @ 5th C argument: pu1_src_top_left
63b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar.equ    pu1_src_top_right_offset,   108    @ 6th C argument: pu1_src_top_right
64b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar.equ    pu1_src_bot_left_offset,    112    @ 7th C argument: pu1_src_bot_left
65b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar.equ    pu1_avail_offset,           116    @ 8th C argument: pu1_avail
66b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar.equ    pi1_sao_offset,             120    @ 9th C argument: pi1_sao_offset (symbol lacks the _offset suffix its siblings use)
67b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar.equ    wd_offset,                  124    @ 10th C argument: wd
68b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar.equ    ht_offset,                  128    @ 11th C argument: ht
69b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.p2align 2                                  @ align to 2^2 = 4 bytes
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gi1_table_edge_idx                  @ table defined outside this file; the function loads it as edge_idx_tbl
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_sao_edge_offset_class0_a9q     @ exported entry point
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ Position-independent reference to gi1_table_edge_idx. The word stored here
@ is the distance from (ulbl1 + 8) to the table. The function loads this word
@ into r14 and executes "add r14,r14,pc" at label ulbl1, which yields the
@ table's run-time address without an absolute relocation in the code.
@ NOTE(review): the "- 8" presumably cancels the ARM-state pc read-ahead
@ (pc reads as the current instruction address + 8) - confirm this file is
@ always assembled as ARM, not Thumb, where the bias differs.
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakargi1_table_edge_idx_addr:
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.long gi1_table_edge_idx - ulbl1 - 8
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_sao_edge_offset_class0_a9q:
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
83b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar    vpush       {d8  -  d15}
84b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar
85b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar    LDR         r9,[sp,#wd_offset]          @Loads wd
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
87b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar    LDR         r4,[sp,#pu1_src_top_left_offset]    @Loads pu1_src_top_left
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I8     Q1,#2                       @const_2 = vdupq_n_s8(2)
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r11,r3,r9                   @pu1_src_top[wd]
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
91b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar    LDR         r10,[sp,#ht_offset]         @Loads ht
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I16    Q2,#0                       @const_min_clip = vdupq_n_s16(0)
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r12,[r11,#-1]               @pu1_src_top[wd - 1]
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
95b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar    LDR         r7,[sp,#pu1_avail_offset]   @Loads pu1_avail
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I16    Q3,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r14, gi1_table_edge_idx_addr @table pointer
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarulbl1:
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r14,pc
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
101b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar    LDR         r8,[sp,#pi1_sao_offset]     @Loads pi1_sao_offset
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r12,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r6,r0                       @pu1_src_org
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D10,[r14]                   @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r4,r10,#1                   @(ht - 1)
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r12,r9                      @Move wd to r12 for loop count
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D11,[r8]                    @offset_tbl = vld1_s8(pi1_sao_offset)
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MUL         r4,r4,r1                    @(ht - 1) * src_strd
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r4,r4,r0                    @pu1_src[(ht - 1) * src_strd]
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_TOP_LOOP:                               @wd is always multiple of 8
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D0,[r4]!                    @Load pu1_src[(ht - 1) * src_strd + col]
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r12,r12,#8                  @Decrement the loop counter by 8
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      D0,[r3]!                    @Store to pu1_src_top[col]
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_TOP_LOOP
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r6,r6,#15                   @pu1_src_org[16 - 1]
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r9,#16                      @Compare wd with 16
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r3,r2                       @pu1_src_left backup to reload later
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r8,r9                       @move wd to r8 for loop count
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_LOOP_16:
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,r9                       @if(col == wd)
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         AU1_MASK_FF                 @jump to else part
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r12,[r7]                    @pu1_avail[0]
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D8[0],r12                   @vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SKIP_AU1_MASK_FF            @Skip the else part
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_MASK_FF:
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r12,#0xFF                   @move -1 to r12
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_AU1_MASK_FF:
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#16                      @If col == 16
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SKIP_MASKING_IF_NOT16       @If not skip masking
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r12,[r7,#1]                 @pu1_avail[1]
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D9[7],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_MASKING_IF_NOT16:
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r12,r0                      @pu1_src_cpy = pu1_src
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r4,r10                      @move ht to r4 for loop count
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP:
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r2]                    @load pu1_src_left since ht - row =0 when it comes first pu1_src_left is incremented later
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D12,[r12]!                  @pu1_cur_row = vld1q_u8(pu1_src_cpy)
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D13,[r12], r1               @pu1_cur_row = vld1q_u8(pu1_src_cpy)
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r12,#8
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r9,r8                    @wd - col
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r14,r10,r4                  @ht - row
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MUL         r14,r14,r1                  @(ht - row) * src_strd
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D26,[r12]!                  @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D27,[r12]                   @II Iteration pu1_cur_row = vld1q_u8(pu1_src_cpy)
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r12,#8
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,r14,r5                   @(ht - row) * src_strd + (wd - col)
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r2, #1]                @II Iteration load pu1_src_left since ht - row + 1 =1
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r14,[r6,r5]                 @pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r4,r4,#1
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D29[7],r11                  @II Iteration vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.I8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q14,Q14,Q13,#15             @II Iteration pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15)
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r9,r8                    @II wd - col
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r12,r12,r1                  @Increment the pu1_src pointer by src_strd
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r12,#16]               @II pu1_src_cpy[16]
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r14,r10,r4                  @II ht - row
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D28[0],r11                  @II pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r12,r12,r1                  @Decrement the pu1_src pointer by src_strd
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MUL         r14,r14,r1                  @II (ht - row) * src_strd
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,r14,r5                   @II (ht - row) * src_strd + (wd - col)
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q14,Q13,Q14,#1              @II pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1)
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r14,[r6,r5]                 @II pu1_src_org[(ht - row) * src_strd + 16 - 1 + (wd - col)]
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.I8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r4,r4,#1                    @Decrement row by 1
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q7,Q1,Q10                   @edge_idx = vaddq_s8(const_2, sign_left)
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r14,[r2],#1                 @II pu1_src_left[(ht - row)] = au1_src_left_tmp[(ht - row)]
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q7,Q7,Q11                   @edge_idx = vaddq_s8(edge_idx, sign_right)
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q9,D12                      @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.I8     Q10,Q0,Q15                  @II sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D14,{D10},D14               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q15,Q13,Q14                 @II vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q0,Q13,Q14                  @II vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D15,{D10},D15               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.I8     Q11,Q0,Q15                  @II sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q7,Q7,Q4                    @edge_idx = vandq_s8(edge_idx, au1_mask)
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D16,{D11},D14               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q0,D26                      @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q14,Q1,Q10                  @II edge_idx = vaddq_s8(const_2, sign_left)
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q14,Q14,Q11                 @II edge_idx = vaddq_s8(edge_idx, sign_right)
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q9,Q9,D16                   @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D28,{D10},D28               @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q9,Q9,Q2                    @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D29,{D10},D29               @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q9,Q9,Q3                    @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q14,Q14,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D17,{D11},D15               @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q7,D13                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D30,{D11},D28               @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q7,Q7,D17                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q7,Q7,Q2                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D31,{D11},D29               @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q7,Q7,Q3                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D18,Q9                      @vmovn_s16(pi2_tmp_cur_row.val[0])
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q0,Q0,D30                   @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D19,Q7                      @vmovn_s16(pi2_tmp_cur_row.val[1])
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q0,Q0,Q2                    @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q14,D27                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q0,Q0,Q3                    @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D0,Q0                       @II vmovn_s16(pi2_tmp_cur_row.val[0])
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q14,Q14,D31                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {D18,D19},[r12],r1          @vst1q_u8(pu1_src_cpy, pu1_cur_row)
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q14,Q14,Q3                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D1,Q14                      @II vmovn_s16(pi2_tmp_cur_row.val[1])
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {D0,D1},[r12],r1            @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         PU1_SRC_LOOP                @If not equal jump to the inner loop
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r0,r0,#16                   @pu1_src += 16
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r8,r8,#16                   @Decrement column by 16
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#8                       @Check whether residue remains
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r2,r3                       @Reload pu1_src_left
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         END_LOOPS                   @Jump to end function
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_RESIDUE:
@ Residue-column setup: computes wd_rem = wd & 0xF and leaves if it is zero;
@ otherwise fills lanes 0 and 7 of D8 (the low half of Q4, which the row loop
@ ANDs in as au1_mask) and initialises the row-loop registers
@ (r4 = row counter, r5 = wd - 1, r12 = pu1_src_cpy).
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r6,r6,#15                   @r6 -= 15; r6 is the base of the pu1_src_org byte load in the row loop (NOTE(review): r6 is set up outside this view - confirm why 15)
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    AND         r8,r9,#0xF                  @wd_rem = wd & 0xF
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#0                       @Residue check
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         END_LOOPS                   @No residue: jump to the function epilogue
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,r9                       @if(wd_rem == wd), i.e. wd < 16 and the whole row is the residue
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         AU1_MASK_FF_RESIDUE         @jump to else part
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r12,[r7]                    @pu1_avail[0]
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(pu1_avail[0], au1_mask, 0)
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           SKIP_AU1_MASK_FF_RESIDUE    @Skip the else part
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_MASK_FF_RESIDUE:
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r12,#0xFF                   @r12 = 0xFF (-1 as a signed byte)
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D8[0],r12                   @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_AU1_MASK_FF_RESIDUE:
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r7,#1]                 @pu1_avail[1]
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r9,#1                    @wd - 1
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r4,r10                      @move ht to r4 for loop count
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D8[7],r11                   @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7): lane 7, not 15, because the row loop stores only 8 result bytes
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r12,r0                      @pu1_src_cpy = pu1_src
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_RESIDUE:
@ Per-row body for the residue columns. Each iteration loads 16 source bytes,
@ derives sign(cur - left) and sign(cur - right) per lane, maps 2 + both signs
@ through the edge-index and offset tables, adds the offset to the low 8
@ pixels, clips, and stores 8 bytes. It also refreshes one byte of
@ pu1_src_left per row. r4 counts the remaining rows.
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D12,[r12]!                  @low half of pu1_cur_row = pu1_src_cpy[0..7]; r12 += 8
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D13,[r12]                   @high half of pu1_cur_row = pu1_src_cpy[8..15]
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r12,#8                      @undo the post-increment so r12 points at pu1_src_cpy again
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r2]                    @load pu1_src_left byte for this row (read before it is overwritten below)
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D15[7],r11                  @vsetq_lane_u8(pu1_src_left[ht - row], pu1_cur_row_tmp, 15)
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q7,Q6,#15                @pu1_cur_row_tmp = vextq_u8(pu1_cur_row_tmp, pu1_cur_row, 15): each lane now holds its left neighbour
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.I8     Q10,Q9,Q8                   @sign_left = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r12,#16]               @pu1_src_cpy[16]
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D14[0],r11                  @pu1_cur_row_tmp = vsetq_lane_u8(pu1_src_cpy[16], pu1_cur_row_tmp, 0)
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q6,Q7,#1                 @pu1_cur_row_tmp = vextq_u8(pu1_cur_row, pu1_cur_row_tmp, 1): each lane now holds its right neighbour
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q8,Q6,Q7                    @vcgtq_u8(pu1_cur_row, pu1_cur_row_tmp)
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q9,Q6,Q7                    @vcltq_u8(pu1_cur_row, pu1_cur_row_tmp)
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.I8     Q11,Q9,Q8                   @sign_right = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q12,Q1,Q10                  @edge_idx = vaddq_s8(const_2, sign_left)
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q12,Q12,Q11                 @edge_idx = vaddq_s8(edge_idx, sign_right)
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D10},D24               @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D25,{D10},D25               @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q12,Q12,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q10,Q11                     @sign_left = vnegq_s8(sign_right)
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q10,Q10,Q11,#15             @sign_left = vextq_s8(sign_left, sign_right, 15); NOTE(review): Q10 is rewritten by the VSUB at the top of the next iteration before any read in this loop - looks redundant, confirm
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D26,{D11},D24               @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q14,Q14,D26                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q14,Q14,Q3                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0]): narrow the 8 clipped results back to bytes
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r14,r10,r4                  @ht - row
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MUL         r14,r14,r1                  @(ht - row) * src_strd
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r11,r14,r5                  @(ht - row) * src_strd + (wd - 1)
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r14,[r6, r11]               @pu1_src_org[(ht - row) * src_strd + (wd - 1)]
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r14,[r2],#1                 @pu1_src_left[(ht - row)] = byte loaded above; r2 post-incremented to the next row's entry
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {D28},[r12],r1              @vst1_u8(pu1_src_cpy, low 8 result bytes); pu1_src_cpy += src_strd
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r4,r4,#1                    @Decrement row by 1
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         PU1_SRC_LOOP_RESIDUE        @If rows remain, repeat the residue row loop
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarEND_LOOPS:
@ Function epilogue: pops d8-d15, then r4-r12, and loads r15 (pc) from the
@ stack, which returns to the caller. Presumably mirrors the pushes in the
@ prologue, which is outside this view.
350b686bb2df155fd1f55220d56f38cc0033afe278cRakesh Kumar    vpop        {d8  -  d15}                @Restore NEON registers d8-d15
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDMFD       sp!,{r4-r12,r15}            @Reload r4-r12 from SP and pop the return address into pc
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
356