10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  ihevc_sao_edge_offset_class1.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  Contains function definitions for inter prediction  interpolation.
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Functions are coded using NEON  intrinsics and can be compiled using@ ARM
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* RVCT
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:author
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  Parthiban V
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:par List of Functions:
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* ,:remarks
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  None
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 src_strd,
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_left,
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top,
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top_left,
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top_right,
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_bot_left,
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_avail,
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD8 *pi1_sao_offset,
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 wd,
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 ht)
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@**************Variables Vs Registers*****************************************
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r0 =>  *pu1_src
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r1 =>  src_strd
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r2 =>  *pu1_src_left
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r3 =>  *pu1_src_top
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r4 =>  *pu1_src_top_left
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r5 =>  *pu1_avail
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r6 =>  *pi1_sao_offset
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r7 =>  wd
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r8 =>  ht
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.p2align 2
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gi1_table_edge_idx
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_sao_edge_offset_class1_a9q
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakargi1_table_edge_idx_addr:
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.long gi1_table_edge_idx - ulbl1 - 8
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_sao_edge_offset_class1_a9q:
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STMFD       sp!, {r4-r12, r14}          @stack stores the values of the arguments
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#60]                 @Loads wd
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r4,[sp,#40]                 @Loads pu1_src_top_left
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#52]                 @Loads pu1_avail
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r6,[sp,#56]                 @Loads pi1_sao_offset
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r8,[sp,#64]                 @Loads ht
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r9,r7,#1                    @wd - 1
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r10,[r4]                    @*pu1_src_top_left = pu1_src_top[wd - 1]
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r10,r0,r9                   @pu1_src[row * src_strd + wd - 1]
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r11,r2                      @Move pu1_src_left pointer to r11
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r12,r8                      @Move ht to r12 for loop count
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP:
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r14,[r10],r1                @Load pu1_src[row * src_strd + wd - 1]
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r14,[r11],#1                @pu1_src_left[row]
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r12,#1                      @Decrement the loop count
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_LEFT_LOOP               @If not equal to 0 jump to the src_left_loop
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r12,r8,#1                   @ht - 1
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MUL         r12,r12,r1                  @(ht - 1) * src_strd
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r12,r12,r0                  @pu1_src[(ht - 1) * src_strd]
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r4,[r5,#2]                  @pu1_avail[2]
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r4,#0                       @0 == pu1_avail[2]
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r8,r8,#1                    @ht--
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r4,[r5,#3]                  @pu1_avail[3]
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r4,#0                       @0 == pu1_avail[3]
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r8,r8,#1                    @ht--
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r14, gi1_table_edge_idx_addr @table pointer
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarulbl1:
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r14,pc
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D6,[r14]                    @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,#16                      @Compare wd with 16
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_LOOP_16:
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r4,[r5,#2]                  @pu1_avail[2]
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r4,#0                       @0 == pu1_avail[2]
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r9,r0,r1                    @pu1_src -= src_strd
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVNE       r9,r3                       @*pu1_src_top
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r10,r0                      @*pu1_src
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D8,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D9,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D30,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D31,[r12]!                  @vld1q_u8(pu1_src[(ht - 1) * src_strd])
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q15},[r3]!                 @vst1q_u8(pu1_src_top[col])
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r11,r8                      @move ht to r11 for loop count
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP:
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r10,r10,r1                  @*pu1_src + src_strd
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,#8
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r6,#8
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_top_row)
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,r10,r1
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q14,D19                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_top_row)
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_top_row)
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q4,Q12,Q11                  @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D13,{D6},D13                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q8,Q4                       @II sign_up = vnegq_s8(sign_down)
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q11,Q11,Q4                  @II edge_idx = vaddq_s8(edge_idx, sign_down)
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D23,{D6},D23                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q4,D11                      @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D13,{D7},D13                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q4,Q4,D13                   @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q4,Q4,Q1                    @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q4,Q4,Q2                    @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D25,{D7},D23                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D21,Q4                      @vmovn_s16(pi2_tmp_cur_row.val[1])
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q14,Q14,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q10},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r11,r11,#2                  @II Decrement the ht loop count by 1
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D31,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q15},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         PU1_SRC_LOOP_END            @if 0 == pu1_avail[3] || 0 == pu1_avail[2] ht = ht--
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r11,#1                      @checking any residue remains
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         PU1_SRC_LOOP                @If not equal jump to PU1_SRC_LOOP
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r10,r10,r1                  @*pu1_src + src_strd
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,#8
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_top_row)
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,r10,r1
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D23,{D6},D23                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D25,{D7},D23                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q14,D11                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q14,Q14,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D31,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[1])
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q15},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_END:
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV        Q5,Q9                       @pu1_cur_row = pu1_next_row
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r7,r7,#16                   @Decrement the wd loop count by 16
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,#8                       @Check whether residue remains
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         WIDTH_RESIDUE               @If residue remains jump to residue loop
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         WIDTH_LOOP_16               @If not equal jump to width_loop
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         END_LOOPS                   @Jump to end function
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_RESIDUE:
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r4,[r5,#2]                  @pu1_avail[2]
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r4,#0                       @0 == pu1_avail[2]
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r9,r0,r1                    @pu1_src -= src_strd
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVNE       r9,r3                       @*pu1_src_top
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r10,r0
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D8,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D9,[r9]!                    @pu1_top_row = vld1q_u8(pu1_src_top_cpy || pu1_src - src_strd)
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D10,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D11,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D30,[r12]                   @vld1_u8(pu1_src[(ht - 1) * src_strd])
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {D30},[r3]                  @vst1_u8(pu1_src_top[col])
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q6,Q5,Q4                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q7,Q5,Q4                    @vcltq_u8(pu1_cur_row, pu1_top_row)
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q8,Q7,Q6                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r11,r8                      @move ht to r11 for loop count
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_RESIDUE:
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r10,r10,r1                  @*pu1_src + src_strd
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,#8
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r6,r10,r1                   @II Iteration *pu1_src + src_strd
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D30,[r6]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D31,[r6]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r6,#8
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q7,Q5,Q9                    @vcltq_u8(pu1_cur_row, pu1_next_row)
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,r10,r1
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q13,D18                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q6,Q0,Q8                    @edge_idx = vaddq_s8(const_2, sign_up)
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q11,Q9,Q15                  @II vcgtq_u8(pu1_cur_row, pu1_next_row)
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q6,Q6,Q10                   @edge_idx = vaddq_s8(edge_idx, sign_down)
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q12,Q9,Q15                  @II vcltq_u8(pu1_cur_row, pu1_next_row)
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q8,Q10                      @sign_up = vnegq_s8(sign_down)
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D12,{D6},D12                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q10,Q12,Q11                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q11,Q0,Q8                   @II edge_idx = vaddq_s8(const_2, sign_up)
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D12,{D7},D12                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q8,Q10                      @II sign_up = vnegq_s8(sign_down)
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q11,Q11,Q10                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q10,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q10,Q10,D12                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D22,{D6},D22                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D7},D22                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q13,Q13,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV        Q5,Q15                      @II pu1_cur_row = pu1_next_row
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {D20},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D30,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r11,r11,#2                  @Decrement the ht loop count by 1
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {D30},[r10],r1              @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         END_LOOPS
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r11,#1
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         PU1_SRC_LOOP_RESIDUE        @If not equal jump to PU1_SRC_LOOP
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r10,r10,r1                  @*pu1_src + src_strd
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D18,[r10]!                  @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D19,[r10]                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,#8
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q6,Q5,Q9                    @vcgtq_u8(pu1_cur_row, pu1_next_row)
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q7,Q9,Q5                    @vcltq_u8(pu1_cur_row, pu1_next_row)
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q10,Q7,Q6                   @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,r10,r1
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q11,Q0,Q8                   @edge_idx = vaddq_s8(const_2, sign_up)
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q11,Q11,Q10                 @edge_idx = vaddq_s8(edge_idx, sign_down)
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D22,{D6},D22                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D7},D22                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q13,D10                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q13,Q13,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q13,Q13,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q13,Q13,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D30,Q13                     @vmovn_s16(pi2_tmp_cur_row.val[0])
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {D30},[r10],r1              @vst1q_u8(pu1_src_cpy, pu1_cur_row)
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarEND_LOOPS:
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
372