10d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/*****************************************************************************
20d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
30d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
40d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
50d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Licensed under the Apache License, Version 2.0 (the "License");
60d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* you may not use this file except in compliance with the License.
70d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* You may obtain a copy of the License at:
80d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
90d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* http://www.apache.org/licenses/LICENSE-2.0
100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Unless required by applicable law or agreed to in writing, software
120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* distributed under the License is distributed on an "AS IS" BASIS,
130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* See the License for the specific language governing permissions and
150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* limitations under the License.
160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*****************************************************************************/
180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@/**
190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @file
210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  ihevc_sao_edge_offset_class3.s
220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @brief
240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  Contains the function definition for SAO edge offset class 3.
250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* Functions are coded using NEON instructions and can be compiled using ARM
260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* RVCT
270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @author
290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  Parthiban V
300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @par List of Functions:
320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  - ihevc_sao_edge_offset_class3_a9q()
330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@* @remarks
350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*  None
360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*
370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*******************************************************************************
380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@*/
390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 src_strd,
410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_left,
420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top,
430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top_left,
440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_top_right,
450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_src_bot_left,
460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              UWORD8 *pu1_avail,
470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD8 *pi1_sao_offset,
480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 wd,
490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@                              WORD32 ht)
500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@**************Variables Vs Registers*****************************************
510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r0 =>  *pu1_src
520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r1 =>  src_strd
530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r2 =>  *pu1_src_left
540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r3 =>  *pu1_src_top
550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r4 =>  *pu1_src_top_left
560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r5 =>  *pu1_avail
570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r6 =>  *pi1_sao_offset
580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r7 =>  wd
590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@r8=>   ht
600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.text
620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.p2align 2
630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.extern gi1_table_edge_idx
650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.globl ihevc_sao_edge_offset_class3_a9q
660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakargi1_table_edge_idx_addr_1:    @ literal word loaded into r14 just before label ulbl1, then added to pc there
680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.long gi1_table_edge_idx - ulbl1 - 8    @ offset from (ulbl1 + 8) to gi1_table_edge_idx, so "add r14,r14,pc" at ulbl1 yields the table address; the 8 presumably matches the ARM-state pc read-ahead -- NOTE(review): confirm this file is never assembled as Thumb
690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakargi1_table_edge_idx_addr_2:    @ literal word loaded into r14 just before label ulbl2, then added to pc there
710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.long gi1_table_edge_idx - ulbl2 - 8    @ offset from (ulbl2 + 8) to gi1_table_edge_idx, so "add r14,r14,pc" at ulbl2 yields the table address; the 8 presumably matches the ARM-state pc read-ahead -- NOTE(review): confirm this file is never assembled as Thumb
720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakargi1_table_edge_idx_addr_3:    @ literal word loaded into r6 just before label ulbl3, then added to pc there
740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar.long gi1_table_edge_idx - ulbl3 - 8    @ offset from (ulbl3 + 8) to gi1_table_edge_idx, so "add r6,r6,pc" at ulbl3 yields the table address; the 8 presumably matches the ARM-state pc read-ahead -- NOTE(review): confirm this file is never assembled as Thumb
750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarihevc_sao_edge_offset_class3_a9q:
770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STMFD       sp!,{r4-r12,r14}            @stack stores the values of the arguments
800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0x3C]               @Loads wd
810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r8,[sp,#0x40]               @Loads ht
830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r9,r7,#1                    @wd - 1
840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r4,[sp,#0x28]               @Loads pu1_src_top_left
860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r10,[r3,r9]                 @pu1_src_top[wd - 1]
870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r9,r7                       @Move width to r9 for loop count
890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0x34]               @Loads pu1_avail
910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r6,[sp,#0x38]               @Loads pi1_sao_offset
920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STR         r3,[sp,#0x38]               @Store pu1_src_top in sp
930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         sp,sp,#0x94                 @Decrement the stack pointer to store some temp arr values
950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r10,[sp]                    @u1_src_top_left_tmp = pu1_src_top[wd - 1]
970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,r8,#1                   @ht-1
980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MLA         r11,r10,r1,r0               @pu1_src[(ht - 1) * src_strd + col]
990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r12,sp,#0x02                @temp array
1000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_SRC_TOP_LOOP:
1020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D0,[r11]!                   @pu1_src[(ht - 1) * src_strd + col]
1030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r9,r9,#8                    @Decrement the loop count by 8
1040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      D0,[r12]!                   @au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col]
1050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         AU1_SRC_TOP_LOOP
1060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_5_LOOP:
1080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r9,[r5,#5]                  @pu1_avail[5]
1090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r9,#0
1100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,r7,#1                   @[wd - 1]
1110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r9,[r0,r10]                 @u1_pos_0_0_tmp = pu1_src[wd - 1]
1120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         PU1_AVAIL_6_LOOP
1130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r11,[sp,#0xC0]              @Load pu1_src_top_right from sp
1150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r10,r10,#1                  @[wd - 1 - 1]
1160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r11]                   @pu1_src_top_right[0]
1180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r12,r9,r11                  @pu1_src[wd - 1] - pu1_src_top_right[0]
1190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r11,r0,r1                   @pu1_src + src_strd
1210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r14,[r11,r10]               @pu1_src[wd - 1 - 1 + src_strd]
1230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r12,#0
1240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r12,#0
1250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r11,r9,r14                  @pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]
1260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r12,#1                      @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0])
1280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r11,#0
1290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r11,#0
1300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r11,#1                      @SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
1310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r14, gi1_table_edge_idx_addr_1 @table pointer
1320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarulbl1:
1330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r14,pc
1340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r11,r12,r11                 @SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +  SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd])
1350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r11,r11,#2                  @edge_idx
1360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
1380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r12,#0                      @0 != edge_idx
1390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         PU1_AVAIL_6_LOOP
1400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRSB       r10,[r6,r12]                @pi1_sao_offset[edge_idx]
1410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r9,r9,r10                   @pu1_src[0] + pi1_sao_offset[edge_idx]
1420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    USAT        r9,#8,r9                    @u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
1430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_6_LOOP:
1450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r10,[r5,#6]                 @pu1_avail[6]
1460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r11,r8,#1                   @ht - 1
1470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r10,#0
1490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STR         r0,[sp,#0xC0]               @Store pu1_src in sp
1500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MLA         r12,r11,r1,r0               @pu1_src[(ht - 1) * src_strd]
1510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r10,[r12]                   @u1_pos_wd_ht_tmp = pu1_src[(ht - 1) * src_strd]
1530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         PU1_AVAIL_3_LOOP
1540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r14,[sp,#0xC4]              @Load pu1_src_bot_left from sp
1560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r11,r12,r1                  @pu1_src[(ht - 1) * src_strd) - src_strd]
1570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r14,[r14]                   @Load pu1_src_bot_left[0]
1590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r11,r11,#1                  @pu1_src[(ht - 1) * src_strd + 1 - src_strd]
1600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r11]                   @Load pu1_src[(ht - 1) * src_strd + 1 - src_strd]
1620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r14,r10,r14                 @pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]
1630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r11,r10,r11                 @pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]
1650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r11,#0
1660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r11,#0
1670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r11,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd])
1680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r14,#0
1700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r14,#0
1710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r14,#1                      @SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0])
1720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r11,r11,r14                 @Add 2 sign value
1740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r14, gi1_table_edge_idx_addr_2 @table pointer
1760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarulbl2:
1770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r14,r14,pc
1780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r11,r11,#2                  @edge_idx
1790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRSB       r12,[r14,r11]               @edge_idx = gi1_table_edge_idx[edge_idx]
1810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r12,#0
1820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         PU1_AVAIL_3_LOOP
1830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRSB       r11,[r6,r12]                @pi1_sao_offset[edge_idx]
1840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r10,r10,r11                 @pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx]
1850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    USAT        r10,#8,r10                  @u1_pos_wd_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1)
1860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_3_LOOP:
1880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STR         r2,[sp,#0xC4]               @Store pu1_src_left in sp
1890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r12,r8                      @Move ht
1900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r14,r2                      @Move pu1_src_left to pu1_src_left_cpy
1920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I8     Q0,#2                       @const_2 = vdupq_n_s8(2)
1930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r5,#3]                 @pu1_avail[3]
1940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r11,#0
1960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I16    Q1,#0                       @const_min_clip = vdupq_n_s16(0)
1970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r12,r12,#1                  @ht_tmp--
1980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
1990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#2]                  @pu1_avail[2]
2000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I16    Q2,#255                     @const_max_clip = vdupq_n_u16((1 << bit_depth) - 1)
2010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r5,#0
2020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADDEQ       r0,r0,r1                    @pu1_src += src_strd
2040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D7,[r6]                     @offset_tbl = vld1_s8(pi1_sao_offset)
2050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r12,r12,#1                  @ht_tmp--
2060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r6, gi1_table_edge_idx_addr_3 @table pointer
2080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakarulbl3:
2090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    add         r6,r6,pc
2100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.S8     Q4,#0xFF                    @au1_mask = vdupq_n_s8(-1)
2110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADDEQ       r14,r14,#1                  @pu1_src_left_cpy += 1
2120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STR         r0,[sp,#0x90]               @Store pu1_src in sp
2140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D6,[r6]                     @edge_idx_tbl = vld1_s8(gi1_table_edge_idx)
2150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r6,r7                       @move wd to r6 loop_count
2160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,#16                      @Compare wd with 16
2180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         WIDTH_RESIDUE               @If not jump to WIDTH_RESIDUE where loop is unrolled for 8 case
2190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#4                       @Compare ht with 4
2200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLE         WD_16_HT_4_LOOP             @If jump to WD_16_HT_4_LOOP
2210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_LOOP_16:
2230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0xD0]               @Loads wd
2240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
2260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r6,r7                       @col == wd
2270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDREQB      r8,[r5]                     @pu1_avail[0]
2280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVNE       r8,#-1
2290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(-1, au1_mask, 0)
2300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r6,#16                      @if(col == 16)
2320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SKIP_AU1_MASK_VAL
2330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r5,#1]                  @pu1_avail[1]
2340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
2350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_AU1_MASK_VAL:
2370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r5,#2]                  @pu1_avail[2]
2380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#0
2390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r4,[sp,#0xD4]               @Loads ht
2410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r8,r0,r1                    @pu1_src - src_strd
2420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVNE       r8,r3
2440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
2450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0xD0]               @Loads wd
2470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
2480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r7,r7,r6                    @(wd - col)
2500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
2510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
2520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,#8
2530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r3,r3,#16
2540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
2560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src)
2570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src)
2580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r0,#8
2590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r7,r7,#15                   @15 + (wd - col)
2600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
2620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
2630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r5,#1
2640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_SRC_LEFT_LOOP:
2660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
2670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r4,r4,#1                    @decrement the loop count
2680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r8,[r5,#1]!                 @store it in the stack pointer
2690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         AU1_SRC_LEFT_LOOP
2700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I8     Q9,#0
2720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
2730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r0,r1                    @I *pu1_src + src_strd
2750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q7,Q8,Q7                    @sign_up = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
2760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r7,r12                      @row count, move ht_tmp to r7
2770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r12,r7                   @I ht_tmp - row
2790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D16,[r8]!                   @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D17,[r8]                    @I pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
2810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,#8
2820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r14,r5                   @I pu1_src_left_cpy[ht_tmp - row]
2830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r8,#1                    @I pu1_src_left_cpy[ht_tmp - row + 1]
2850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r8]
2860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @I Loads pu1_avail
2880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D19[7],r8                   @I vsetq_lane_u8
2890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#2]                  @I pu1_avail[2]
2900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q9,Q9,Q8,#15                @I pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
2920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r5,#0                       @I
2930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SIGN_UP_CHANGE_DONE         @I
2940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE:
2960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r0,#15]                 @I pu1_src_cpy[15]
2970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r0,r1                    @I pu1_src_cpy[16 - src_strd]
2980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
2990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#16]                 @I load the value
3000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,r8,r5                    @I pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
3010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#0                       @I
3020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r8,#0                       @I
3030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r8,#1                       @I SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
3040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D15[7],r8                   @I sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
3050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ SIGN_UP_CHANGE_DONE: row I edge classification and offset add for 16 pixels. Per the existing notes: Q6 = pu1_cur_row, Q9 = pu1_next_row_tmp, Q7 = sign_up, Q0 = const_2, D6 = edge_idx_tbl, D7 = offset_tbl, Q4 = au1_mask, Q1/Q2 = min/max clip.
3060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_DONE:
3070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q5,Q6,Q9                    @I vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
3080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q9,Q6,Q9                    @I vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
3090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q5,Q9,Q5                    @I sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
3100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ The compares give 0xFF/0x00 per lane, so cmp_lt - cmp_gt leaves sign_down = +1 where cur > next_tmp, -1 where cur < next_tmp and 0 where they are equal.
3110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q9,Q0,Q7                    @I edge_idx = vaddq_s8(const_2, sign_up)
3120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q9,Q9,Q5                    @I edge_idx = vaddq_s8(edge_idx, sign_down)
3130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D18,{D6},D18                @I vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
3140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q7,Q5                       @I sign_up = vnegq_s8(sign_down)
3150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ VEXT with both sources equal to Q7 rotates the 16 byte lanes by one position, producing the sign_up vector consumed by the next row.
3160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q7,Q7,#1                 @I sign_up = vextq_s8(sign_up, sign_up, 1)
3170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D19,{D6},D19                @I vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
3180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q10,D12                     @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
3200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q9,Q9,Q4                    @I edge_idx = vandq_s8(edge_idx, au1_mask)
3210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D10,{D7},D18                @I offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
3230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q11,D13                     @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
3250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q10,Q10,D10                 @I pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
3260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q10,Q10,Q1                  @I pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
3280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D11,{D7},D19                @I offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
3290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q10,Q10,Q2                  @I pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
3300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV        Q6,Q8                       @I pu1_cur_row = pu1_next_row
3320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q11,Q11,D11                 @I pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
3330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q11,Q11,Q1                  @I pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
3350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q11,Q11,Q2                  @I pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
3360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r7,r7,#1                    @I Decrement the ht_tmp loop count by 1
3380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ PU1_SRC_LOOP: interleaved row loop. Each pass narrows and stores the pending row (@I, Q10/Q11), fully processes the next row (@II, result in Q14/Q13, stored from Q14) and computes a third row (@III) into Q10/Q11 for the next pass or the tail; r7 is decremented twice per pass.
3390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP:
3400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r0,r1,LSL #1             @II r8 = pu1_src_cpy + 2*src_strd (r0 still points at the row-I store address)
3410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D20,Q10                     @I vmovn_s16(pi2_tmp_cur_row.val[0])
3420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r12,r7                   @II ht_tmp - row
3430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r4,r0,r1                    @II r4 = pu1_src_cpy + src_strd
3450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D21,Q11                     @I vmovn_s16(pi2_tmp_cur_row.val[1])
3460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r2,r8,r1                    @III r2 = r8 + src_strd
3470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r4,#15]                @II pu1_src_cpy[15]
3490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D16,[r8]!                   @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
3500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D17,[r8]                    @II pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
3510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,#8                       @II undo the 8-byte post-increment from the D16 load (r8 is reassigned just below)
3520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r7,r7,#1                    @II Decrement the ht_tmp loop count by 1
3530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r14,r5                   @II pu1_src_left_cpy[ht_tmp - row]
3550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D30,[r2]!                   @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
3560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D31,[r2]                    @III pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
3570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r2,#8                       @III undo the 8-byte post-increment from the D30 load (r2 is read again by the SUBNE below)
3580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r8,#1]                  @II pu1_src_left_cpy[ht_tmp - row + 1]
3590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r4,[r0,#16]                 @II byte 16 of the row at r0, i.e. [16 - src_strd] relative to the row at r0 + src_strd
3610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D19[7],r8                   @II vsetq_lane_u8
3620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r11,r11,r4                  @II pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
3630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r11,#0                      @II
3650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q10},[r0],r1               @I vst1q_u8(pu1_src_cpy, pu1_cur_row)
3660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r12,r7                   @III ht_tmp - row
3670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r11,#0                      @II difference < 0: r11 = ~0 = -1
3690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q9,Q9,Q8,#15                @II pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
3700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r11,#1                      @II SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
3710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r14,r5                   @III pu1_src_left_cpy[ht_tmp - row]
3730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D15[7],r11                  @II sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
3740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,#1                       @III only r7 == 1 goes on to test pu1_avail[3]
3750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         NEXT_ROW_ELSE_2             @III
3770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @III Loads pu1_avail
3780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#3]                  @III pu1_avail[3]
3790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r5,#0                       @III
3800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBNE       r8,r2,#2                    @III pu1_avail[3] != 0: r8 = r2 - 2, so the +1 load below reads the byte just before the row at r2
3810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_ELSE_2:
3830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r8,#1]                  @III byte inserted into D19[7] below
3840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q12,Q6,Q9                   @II vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
3850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,r0,r1                    @III r5 = r0 + src_strd (r0 was advanced by the row-I store above)
3860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r2,[r5,#15]                 @III pu1_src_cpy[15]
3880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q13,Q6,Q9                   @II vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
3890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r0,#16]                 @III load the value
3900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r2,r2,r5                    @III pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
3920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q12,Q13,Q12                 @II sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
3930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r2,#0                       @III
3940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r2,#0                       @III difference < 0: r2 = ~0 = -1
3960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D19[7],r8                   @III vsetq_lane_u8
3970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r2,#1                       @III SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
3980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
3990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r7,r7,#1                    @III Decrement the ht_tmp loop count by 1
4000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q13,Q0,Q7                   @II edge_idx = vaddq_s8(const_2, sign_up)
4010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q7,Q12                      @II sign_up = vnegq_s8(sign_down)
4030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q9,Q9,Q15,#15               @III pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
4040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q13,Q13,Q12                 @II edge_idx = vaddq_s8(edge_idx, sign_down)
4060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q7,Q7,#1                 @II sign_up = vextq_s8(sign_up, sign_up, 1)
4080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D26,{D6},D26                @II vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
4090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q5,Q8,Q9                    @III vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
4100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D15[7],r2                   @III sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
4120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D27,{D6},D27                @II vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
4130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q9,Q8,Q9                    @III vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
4140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q14,D12                     @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
4160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q13,Q13,Q4                  @II edge_idx = vandq_s8(edge_idx, au1_mask)
4170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q5,Q9,Q5                    @III sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
4190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D7},D26                @II offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
4200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q9,Q0,Q7                    @III edge_idx = vaddq_s8(const_2, sign_up)
4210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q9,Q9,Q5                    @III edge_idx = vaddq_s8(edge_idx, sign_down)
4230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D25,{D7},D27                @II offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
4240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q7,Q5                       @III sign_up = vnegq_s8(sign_down)
4250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q14,Q14,D24                 @II pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
4270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D18,{D6},D18                @III vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
4280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q14,Q14,Q1                  @II pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
4290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q7,Q7,#1                 @III sign_up = vextq_s8(sign_up, sign_up, 1)
4310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D19,{D6},D19                @III vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
4320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q14,Q14,Q2                  @II pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
4330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q13,D13                     @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
4350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q9,Q9,Q4                    @III edge_idx = vandq_s8(edge_idx, au1_mask)
4360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q13,Q13,D25                 @II pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
4380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D10,{D7},D18                @III offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
4390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q13,Q13,Q1                  @II pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
4400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q10,D16                     @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
4420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q13,Q13,Q2                  @II pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
4430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q10,Q10,D10                 @III pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
4450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D11,{D7},D19                @III offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
4460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q10,Q10,Q1                  @III pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
4470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q11,D17                     @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
4490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q10,Q10,Q2                  @III pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
4500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D28,Q14                     @II vmovn_s16(pi2_tmp_cur_row.val[0])
4520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q11,Q11,D11                 @III pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
4530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D29,Q13                     @II vmovn_s16(pi2_tmp_cur_row.val[1])
4550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q11,Q11,Q1                  @III pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
4560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV        Q6,Q15                      @II pu1_cur_row = pu1_next_row
4580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q11,Q11,Q2                  @III pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
4590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,#1                       @III r7 > 1: loop again; r7 == 1: fall through to the one-row tail; r7 < 1: INNER_LOOP_DONE
4610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q14},[r0],r1               @II vst1q_u8(pu1_src_cpy, pu1_cur_row)
4620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         PU1_SRC_LOOP                @r7 > 1: run another pass of PU1_SRC_LOOP
4630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         INNER_LOOP_DONE             @r7 < 1: skip the tail (r7 == 1 falls through)
4640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ Tail reached by fall-through when r7 == 1 after PU1_SRC_LOOP: stores the pending @III row (Q10/Q11) and computes one more row into Q10/Q11, which INNER_LOOP_DONE then narrows and stores.
4650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r0,r1,LSL #1             @r8 = pu1_src_cpy + 2*src_strd
4660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D20,Q10                     @III vmovn_s16(pi2_tmp_cur_row.val[0])
4670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
4680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#3]                  @pu1_avail[3]
4700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D21,Q11                     @III vmovn_s16(pi2_tmp_cur_row.val[1])
4710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r5,#0                       @flags are consumed by the BEQ below; no instruction in between updates them
4720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r4,r0,r1                    @r4 = pu1_src_cpy + src_strd
4740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
4750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
4760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,#8                       @undo the 8-byte post-increment so r8 points at the start of the loaded row again
4770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r0,#16]                 @r5 = byte 16 of the row at r0; overwrites pu1_avail[3], whose flags are already set
4780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         NEXT_ROW_ELSE_3             @pu1_avail[3] == 0: take the byte from pu1_src_left_cpy instead
4800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
4810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    B           NEXT_ROW_POINTER_ASSIGNED_3
4820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_ELSE_3:
4830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r11,r12,r7                  @ht_tmp - row
4840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r14,r11                  @pu1_src_left_cpy[ht_tmp - row]
4850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
4860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r8]
4870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_POINTER_ASSIGNED_3:
4890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r11,[r4,#15]                @pu1_src_cpy[15]
4900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D19[7],r8                   @vsetq_lane_u8
4910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,r11,r5                   @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
4920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#0
4940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15)
4950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r8,#0                       @difference < 0: r8 = ~0 = -1
4960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
4970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q10},[r0],r1               @III vst1q_u8(pu1_src_cpy, pu1_cur_row)
4980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q12,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
4990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd])
5010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q13,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
5020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
5040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q12,Q13,Q12                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
5050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q10,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
5070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
5080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q11,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
5100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
5110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
5130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
5140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
5160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
5180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q10,Q10,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
5200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
5210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q10,Q10,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
5220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q10,Q10,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
5240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q11,Q11,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
5260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q11,Q11,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
5270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q11,Q11,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
5280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ INNER_LOOP_DONE: narrow and store the last pending row (Q10/Q11), copy au1_src_left_tmp back to pu1_src_left, then step the column counter r6 by 16 and dispatch to the next width stage.
5290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarINNER_LOOP_DONE:
5300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D20,Q10                     @vmovn_s16(pi2_tmp_cur_row.val[0])
5310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r8,[sp,#0xD4]               @Loads ht
5320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D21,Q11                     @vmovn_s16(pi2_tmp_cur_row.val[1])
5340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
5350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q10},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row)
5370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
5380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP:
5390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[r5],#4                  @au1_src_left_tmp[row], one 32-bit word (4 bytes) per pass
5400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r8,r8,#4                    @NOTE(review): the BNE below exits only when r8 reaches exactly 0, so ht is assumed to be a non-zero multiple of 4
5410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
5420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_LEFT_LOOP
5430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ Width-loop control: BLT/BGT/BEQ below all test the flags from CMP r6,#8 (the LDR/SUB/ADD in between do not update flags), so exactly one of the three branches is always taken.
5440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16 (its flags are overwritten by the CMP below)
5450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r6,#8                       @Check whether residue remains
5460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLT         RE_ASSINING_LOOP            @Jump to re-assigning loop
5470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0xD0]               @Loads wd
5480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r0,[sp,#0x90]               @Loads *pu1_src
5490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r7,r7,r6                    @wd - col
5500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r0,r0,r7                    @pu1_src + (wd - col)
5510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         WIDTH_LOOP_16               @col > 8: process another 16-wide column (WIDTH_LOOP_16)
5520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         WIDTH_RESIDUE               @col == 8: jump to the residue loop (WIDTH_RESIDUE)
5530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar@ WD_16_HT_4_LOOP: per-column setup - patches lanes 0 and 15 of au1_mask (d8[0], d9[7]) from pu1_avail, loads the 16-byte top row at +1 into D10/D11, and sets up r3/r4/r5/r7, presumably for AU1_SRC_LEFT_LOOP_WD_16_HT_4 below (its body is outside this excerpt).
5560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWD_16_HT_4_LOOP:
5570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
5580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0xD0]               @Loads wd
5590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r6,r7                       @col == wd
5600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDREQB      r8,[r5]                     @col == wd: r8 = pu1_avail[0]
5610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVNE       r8,#-1                      @otherwise r8 = -1
5620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(r8, au1_mask, 0)
5630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r6,#16                      @if(col == 16)
5650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SKIP_AU1_MASK_VAL_WD_16_HT_4
5660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r5,#1]                  @pu1_avail[1]
5670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      d9[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 15)
5680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSKIP_AU1_MASK_VAL_WD_16_HT_4:
5700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r5,#2]                  @pu1_avail[2]
5710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#0
5720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r8,r0,r1                    @pu1_avail[2] == 0: r8 = pu1_src - src_strd
5740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVNE       r8,r3                       @pu1_avail[2] != 0: r8 = r3 (presumably the pu1_src_top pointer - TODO confirm)
5750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r8,#1                    @pu1_src - src_strd + 1
5760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
5770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(pu1_src - src_strd + 1)
5780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,#8                       @undo the 8-byte post-increment (r8 is reloaded from the stack below)
5790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r3,r3,#16                   @advance r3 by one 16-byte column
5810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
5820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r4,[sp,#0xD4]               @Loads ht
5830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0xD0]               @Loads wd
5840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r7,r7,r6                    @(wd - col)
5850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r7,r7,#15                   @15 + (wd - col)
5860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r8,[sp,#0xC0]               @Loads *pu1_src; NOTE(review): the width-loop code loads pu1_src from [sp,#0x90] - confirm which slot holds which pointer
5870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r7,r8,r7                    @pu1_src[0 * src_strd + 15 + (wd - col)]
5880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r5,#1                    @r5 = au1_src_left_tmp - 1
5890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ AU1_SRC_LEFT_LOOP_WD_16_HT_4: copies ht bytes (r4 = ht), one per row, from the
@ column addressed by r7 (stepping by src_strd in r1) into au1_src_left_tmp via
@ r5. Afterwards loads the current row into Q6 and derives the initial sign_up
@ (Q7) against the top row in Q5.
5900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_SRC_LEFT_LOOP_WD_16_HT_4:
5910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
5920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r8,[r5,#1]!                 @store it into au1_src_left_tmp (on the stack), pre-incrementing r5
5930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r4,r4,#1                    @decrement the loop count
5940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         AU1_SRC_LEFT_LOOP_WD_16_HT_4
5950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
5960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src), low 8 bytes
5970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src), high 8 bytes
5980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r0,#8                       @undo the post-increment so r0 points at the row start again
5990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
6010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
6020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q7,Q8,Q7                    @sign_up = cmp_lt - cmp_gt: +1 where cur > top, -1 where cur < top, 0 if equal
6030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I8     Q9,#0                       @clear pu1_next_row_tmp (Q9)
6040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r7,r12                      @row count, move ht_tmp to r7
6050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ PU1_SRC_LOOP_WD_16_HT_4: row loop for a 16-column strip; r7 counts down from
@ ht_tmp (r12). Per iteration:
@   1. load the next row (Q8) and build pu1_next_row_tmp (Q9) = next row moved
@      up by one lane, with the byte for lane 0 taken either from
@      pu1_src_cpy[src_strd - 1] or from pu1_src_left_cpy (r14);
@   2. patch lane 15 of sign_up (Q7) from scalar pixel loads;
@   3. edge_idx = const_2 (Q0) + sign_up + sign_down, mapped through the tables
@      in D6 and D7 and masked with au1_mask (Q4);
@   4. add the offsets to the current row, clamp with Q1/Q2, store 16 bytes and
@      advance r0 by src_strd.
6060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_WD_16_HT_4:
6070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r0,r1                    @r8 = pu1_src_cpy + src_strd (next row)
6080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
6090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
6100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,#8                       @undo the post-increment: r8 = pu1_src_cpy + src_strd
6110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
6120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#3]                  @pu1_avail[3]
6130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r5,#0
6140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         NEXT_ROW_ELSE_WD_16_HT_4    @pu1_avail[3] == 0: always use pu1_src_left_cpy
6150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,#1                       @last row of the strip (row count == 1)?
6160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDREQB      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
6170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         NEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4
6180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_ELSE_WD_16_HT_4:
6190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r12,r7                   @ht_tmp - row
6200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
6210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
6220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r8]                     @r8 = pu1_src_left_cpy[ht_tmp - row + 1]
6230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_POINTER_ASSIGNED_WD_16_HT_4:
6250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D19[7],r8                   @vsetq_lane_u8: lane 15 of pu1_next_row_tmp = r8
6260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15): lane 0 = r8, lane i = next_row[i - 1]
6270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,r12                      @first row of the strip (row count == ht_tmp)?
6290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SIGN_UP_CHANGE_WD_16_HT_4   @not the first row: always patch lane 15
6300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
6310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#2]                  @pu1_avail[2]
6320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r5,#0
6330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SIGN_UP_CHANGE_DONE_WD_16_HT_4 @first row and pu1_avail[2] != 0: keep sign_up as computed
6340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_WD_16_HT_4:
6360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
6370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,r0,#16                   @pu1_src_cpy[16]
6380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
6390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5]                     @load the value
6400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
6410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#0
6420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r8,#0                       @negative difference -> -1
6430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]); a zero difference stays 0
6440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
6450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_DONE_WD_16_HT_4:
6470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
6480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
6490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
6500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
6520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
6530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
6540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
6550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
6570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
6590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1): lane i takes lane i + 1; lane 15 is patched next iteration
6600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
6620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
6630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
6640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
6650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
6660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D25,{D7},D27                @offset = vtbl1_s8(offset_tbl, vget_high_s8(edge_idx))
6680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q15,D13                     @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pu1_cur_row)))
6690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q15,Q15,D25                 @pi2_tmp_cur_row.val[1] = vaddw_s8(pi2_tmp_cur_row.val[1], offset)
6700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q15,Q15,Q1                  @pi2_tmp_cur_row.val[1] = vmaxq_s16(pi2_tmp_cur_row.val[1], const_min_clip)
6710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q15,Q15,Q2                  @pi2_tmp_cur_row.val[1] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[1]), const_max_clip))
6720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D28,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
6740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D29,Q15                     @vmovn_s16(pi2_tmp_cur_row.val[1])
6750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {Q14},[r0],r1               @vst1q_u8(pu1_src_cpy, pu1_cur_row); then pu1_src_cpy += src_strd
6770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
6790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r7,r7,#1                    @Decrement the ht_tmp loop count by 1
6800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         PU1_SRC_LOOP_WD_16_HT_4     @If not equal jump to PU1_SRC_LOOP_WD_16_HT_4
6810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ SRC_LEFT_LOOP_WD_16_HT_4: copies ht bytes from au1_src_left_tmp (sp + 0x42)
@ to pu1_src_left, one 32-bit word per iteration, then steps the column
@ counter r6 down by 16 and either starts the next strip or leaves the loop.
@ The copy loop exits only when the count reaches exactly zero.
@ NOTE(review): that presumes ht is a multiple of 4 - confirm against callers.
@ NOTE(review): sp + 0x42 is not word-aligned when sp is; the LDR appears to
@ rely on unaligned word access being permitted - confirm for the target.
6820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r8,[sp,#0xD4]               @Loads ht
6830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
6840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
6850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP_WD_16_HT_4:
6860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[r5],#4                  @au1_src_left_tmp[row], four rows at a time
6870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
6880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r8,r8,#4                    @four rows copied
6890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_LEFT_LOOP_WD_16_HT_4
6900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r6,r6,#16                   @Decrement the wd loop count by 16
6920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BLE         RE_ASSINING_LOOP            @No columns left: jump to re-assigning loop
6930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BGT         WD_16_HT_4_LOOP             @Columns remain (col > 0): next 16-column strip. NOTE(review): a remainder of 8 also comes back here, not to WIDTH_RESIDUE - confirm intended
6940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
6950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ WIDTH_RESIDUE: set-up for the final 8-column strip. Patches lanes 0 and 7 of
@ au1_mask (D8, the low half of Q4), loads the 16-byte top row into Q5 and
@ prepares r5/r7/r4 for saving column (wd - 1) into au1_src_left_tmp.
6960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarWIDTH_RESIDUE:
6970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0xD0]               @Loads wd
6980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
6990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r6,r7                       @wd_residue == wd
7000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDREQB      r8,[r5]                     @yes: r8 = pu1_avail[0]
7010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVNE       r8,#-1                      @no:  r8 = -1 (all bits set)
7030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      d8[0],r8                    @au1_mask = vsetq_lane_s8(r8, au1_mask, 0), r8 is pu1_avail[0] or -1
7040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r5,#1]                  @pu1_avail[1]
7060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      d8[7],r8                    @au1_mask = vsetq_lane_s8(pu1_avail[1], au1_mask, 7): lane 7, the last of the 8 residue columns
7070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_AVAIL_2_RESIDUE:
7090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r5,#2]                  @pu1_avail[2]
7100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#0
7110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBEQ       r8,r0,r1                    @pu1_avail[2] == 0: top row pointer = pu1_src - src_strd
7130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVNE       r8,r3                       @pu1_avail[2] != 0: top row pointer = r3
7140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r8,#1                    @top row pointer + 1 (row above, one column to the right)
7150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D10,[r8]!                   @pu1_top_row = vld1q_u8(top row pointer + 1), low 8 bytes
7160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D11,[r8]                    @pu1_top_row = vld1q_u8(top row pointer + 1), high 8 bytes
7170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,#8                       @undo the post-increment of the first VLD1
7180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
7210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r4,[sp,#0xD4]               @Loads ht
7220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0xD0]               @Loads wd
7230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r8,[sp,#0xC0]               @Loads *pu1_src
7240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r7,r7,#1                    @(wd - 1)
7250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r7,r8,r7                    @r7 = &pu1_src[0 * src_strd + (wd - 1)]
7260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r5,#1                    @bias by -1: the copy loop stores with pre-increment [r5,#1]!
7270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ AU1_SRC_LEFT_LOOP_RESIDUE: copies ht bytes (r4 = ht), one per row, from the
@ column addressed by r7 (stepping by src_strd in r1) into au1_src_left_tmp via
@ r5. Afterwards loads the current row into Q6 and derives the initial sign_up
@ (Q7) against the top row in Q5.
7280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarAU1_SRC_LEFT_LOOP_RESIDUE:
7290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r7],r1                  @load the value and increment by src_strd
7300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r8,[r5,#1]!                 @store it into au1_src_left_tmp (on the stack), pre-incrementing r5
7310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r4,r4,#1                    @decrement the loop count
7320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         AU1_SRC_LEFT_LOOP_RESIDUE
7330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D12,[r0]!                   @pu1_cur_row = vld1q_u8(pu1_src), low 8 bytes
7350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D13,[r0]                    @pu1_cur_row = vld1q_u8(pu1_src), high 8 bytes
7360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r0,#8                       @undo the post-increment so r0 points at the row start again
7370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q7,Q6,Q5                    @vcgtq_u8(pu1_cur_row, pu1_top_row)
7390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q8,Q6,Q5                    @vcltq_u8(pu1_cur_row, pu1_top_row)
7400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q7,Q8,Q7                    @sign_up = cmp_lt - cmp_gt: +1 where cur > top, -1 where cur < top, 0 if equal
7410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOV         r7,r12                      @row count, move ht_tmp to r7
7420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ PU1_SRC_LOOP_RESIDUE: row loop for the 8-column residue strip; r7 counts down
@ from ht_tmp (r12). Rows are still loaded and compared 16 bytes wide, but only
@ the low 8 result bytes (D30) are written back per row.
@ NOTE(review): the high-half lookup written to D27 is not read again inside
@ this loop - looks redundant, confirm before removing.
7430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarPU1_SRC_LOOP_RESIDUE:
7440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.I8     Q9,#0                       @clear pu1_next_row_tmp (Q9)
7450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r0,r1                    @r8 = pu1_src_cpy + src_strd (next row)
7460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D16,[r8]!                   @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
7470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D17,[r8]                    @pu1_next_row = vld1q_u8(pu1_src_cpy + src_strd)
7480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,#8                       @undo the post-increment: r8 = pu1_src_cpy + src_strd
7490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
7500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#3]                  @pu1_avail[3]
7510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r5,#0
7520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         NEXT_ROW_ELSE_RESIDUE       @pu1_avail[3] == 0: always use pu1_src_left_cpy
7530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,#1                       @last row of the strip (row count == 1)?
7540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDREQB      r8,[r8,#-1]                 @pu1_src_cpy[src_strd - 1]
7550d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BEQ         NEXT_ROW_POINTER_ASSIGNED_RESIDUE
7560d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_ELSE_RESIDUE:
7570d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r12,r7                   @ht_tmp - row
7580d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r14,r5                   @pu1_src_left_cpy[ht_tmp - row]
7590d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r8,#1                    @pu1_src_left_cpy[ht_tmp - row + 1]
7600d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r8]                     @r8 = pu1_src_left_cpy[ht_tmp - row + 1]
7610d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7620d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarNEXT_ROW_POINTER_ASSIGNED_RESIDUE:
7630d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D19[7],r8                   @vsetq_lane_u8: lane 15 of pu1_next_row_tmp = r8
7640d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q9,Q9,Q8,#15                @pu1_next_row_tmp = vextq_u8(pu1_next_row_tmp, pu1_next_row, 15): lane 0 = r8, lane i = next_row[i - 1]
7650d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7660d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r7,r12                      @first row of the strip (row count == ht_tmp)?
7670d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SIGN_UP_CHANGE_RESIDUE      @not the first row: always patch lane 15
7680d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r5,[sp,#0xC8]               @Loads pu1_avail
7690d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5,#2]                  @pu1_avail[2]
7700d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r5,#0
7710d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SIGN_UP_CHANGE_DONE_RESIDUE @first row and pu1_avail[2] != 0: keep sign_up as computed
7720d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7730d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_RESIDUE:
7740d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[r0,#15]                 @pu1_src_cpy[15]
7750d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,r0,#16                   @pu1_src_cpy[16]
7760d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r5,r5,r1                    @pu1_src_cpy[16 - src_strd]
7770d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r5,[r5]                     @load the value
7780d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r8,r8,r5                    @pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]
7790d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    CMP         r8,#0
7800d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MVNLT       r8,#0                       @negative difference -> -1
7810d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MOVGT       r8,#1                       @SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]); a zero difference stays 0
7820d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV.8      D15[7],r8                   @sign_up = vsetq_lane_s8(SIGN(pu1_src_cpy[15] - pu1_src_cpy[16 - src_strd]), sign_up, 15)
7830d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7840d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSIGN_UP_CHANGE_DONE_RESIDUE:
7850d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCGT.U8     Q10,Q6,Q9                   @vcgtq_u8(pu1_cur_row, pu1_next_row_tmp)
7860d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VCLT.U8     Q11,Q6,Q9                   @vcltq_u8(pu1_cur_row, pu1_next_row_tmp)
7870d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VSUB.U8     Q12,Q11,Q10                 @sign_down = vreinterpretq_s8_u8(vsubq_u8(cmp_lt, cmp_gt))
7880d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7890d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q13,Q0,Q7                   @edge_idx = vaddq_s8(const_2, sign_up)
7900d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADD.I8     Q13,Q13,Q12                 @edge_idx = vaddq_s8(edge_idx, sign_down)
7910d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D26,{D6},D26                @vtbl1_s8(edge_idx_tbl, vget_low_s8(edge_idx))
7920d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D27,{D6},D27                @vtbl1_s8(edge_idx_tbl, vget_high_s8(edge_idx))
7930d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7940d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VAND        Q13,Q13,Q4                  @edge_idx = vandq_s8(edge_idx, au1_mask)
7950d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7960d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VNEG.S8     Q7,Q12                      @sign_up = vnegq_s8(sign_down)
7970d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VEXT.8      Q7,Q7,Q7,#1                 @sign_up = vextq_s8(sign_up, sign_up, 1): lane i takes lane i + 1
7980d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
7990d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VTBL.8      D24,{D7},D26                @offset = vtbl1_s8(offset_tbl, vget_low_s8(edge_idx))
8000d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVL.U8    Q14,D12                     @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pu1_cur_row)))
8010d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VADDW.S8    Q14,Q14,D24                 @pi2_tmp_cur_row.val[0] = vaddw_s8(pi2_tmp_cur_row.val[0], offset)
8020d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMAX.S16    Q14,Q14,Q1                  @pi2_tmp_cur_row.val[0] = vmaxq_s16(pi2_tmp_cur_row.val[0], const_min_clip)
8030d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMIN.U16    Q14,Q14,Q2                  @pi2_tmp_cur_row.val[0] = vreinterpretq_s16_u16(vminq_u16(vreinterpretq_u16_s16(pi2_tmp_cur_row.val[0]), const_max_clip))
8040d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8050d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOVN.I16   D30,Q14                     @vmovn_s16(pi2_tmp_cur_row.val[0])
8060d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8070d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      {D30},[r0],r1               @vst1_u8(pu1_src_cpy, low 8 result bytes); then pu1_src_cpy += src_strd
8080d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VMOV        Q6,Q8                       @pu1_cur_row = pu1_next_row
8090d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r7,r7,#1                    @Decrement the row loop count by 1
8100d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         PU1_SRC_LOOP_RESIDUE        @If not zero jump to PU1_SRC_LOOP_RESIDUE
8110d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ SRC_LEFT_LOOP_RESIDUE: copies ht bytes from au1_src_left_tmp (sp + 0x42) to
@ pu1_src_left, one 32-bit word per iteration, then falls through to
@ RE_ASSINING_LOOP. The loop exits only when the count reaches exactly zero.
@ NOTE(review): that presumes ht is a multiple of 4 - confirm against callers.
8120d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r8,[sp,#0xD4]               @Loads ht
8130d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r2,[sp,#0xC4]               @Loads *pu1_src_left
8140d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r5,sp,#0x42                 @*au1_src_left_tmp
8150d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8160d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_LEFT_LOOP_RESIDUE:
8170d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[r5],#4                  @au1_src_left_tmp[row], four rows at a time
8180d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r8,r8,#4                    @four rows copied; flags are consumed by the BNE below (STR leaves them intact)
8190d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STR         r7,[r2],#4                  @pu1_src_left[row] = au1_src_left_tmp[row]
8200d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_LEFT_LOOP_RESIDUE
8210d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8220d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ RE_ASSINING_LOOP: write-back stage reached once all strips are done.
@   - pu1_src_org[wd - 1]               = r9  (u1_pos_wd_0_tmp)
@   - pu1_src_org[(ht - 1) * src_strd]  = r10
@   - *pu1_src_top_left                 = byte saved at [sp]
@   - pu1_src_top[0 .. wd - 1]          = au1_src_top_tmp (sp + 0x02), 8 bytes
@     per SRC_TOP_LOOP iteration.
@ SRC_TOP_LOOP exits only when the count reaches exactly zero.
@ NOTE(review): that presumes wd is a multiple of 8 - confirm against callers.
8230d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarRE_ASSINING_LOOP:
8240d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r7,[sp,#0xD0]               @Loads wd
8250d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r0,[sp,#0xC0]               @Loads *pu1_src
8260d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8270d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r11,[sp,#0xD4]              @Loads ht
8280d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r8,r0,r7                    @r8 = &pu1_src[wd]
8290d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8300d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r4,[sp,#0xBC]               @Loads pu1_src_top_left
8310d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUB         r11,r11,#1                  @ht - 1
8320d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8330d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r9,[r8,#-1]                 @pu1_src_org[wd - 1] = u1_pos_wd_0_tmp
8340d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    MLA         r6,r11,r1,r0                @r6 = &pu1_src_org[(ht - 1) * src_strd]
8350d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8360d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDRB        r8,[sp]                     @load u1_src_top_left_tmp from stack pointer
8370d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         r12,sp,#0x02                @r12 = au1_src_top_tmp
8380d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8390d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r10,[r6]                    @pu1_src_org[(ht - 1) * src_strd] = r10 (column 0 of the last row)
8400d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    STRB        r8,[r4]                     @*pu1_src_top_left = u1_src_top_left_tmp
8410d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDR         r3,[sp,#0xCC]               @Loads pu1_src_top
8420d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8430d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarSRC_TOP_LOOP:
8440d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VLD1.8      D0,[r12]!                   @load 8 bytes of au1_src_top_tmp[col]
8450d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    SUBS        r7,r7,#8                    @Decrement the width
8460d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    VST1.8      D0,[r3]!                    @pu1_src_top[col] = au1_src_top_tmp[col]
8470d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    BNE         SRC_TOP_LOOP
8480d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
@ END_LOOPS: function epilogue. Releases the 0x94 bytes of stack temporaries,
@ then restores r4-r12 and loads the return address straight into pc (r15).
@ NOTE(review): the visible code writes d8-d15 (Q4-Q7), which AAPCS treats as
@ callee-saved, and no restore of them appears in this epilogue - confirm
@ against the prologue and callers.
8490d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish MahendrakarEND_LOOPS:
8500d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    ADD         sp,sp,#0x94                 @release the stack temporaries
8510d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar    LDMFD       sp!,{r4-r12,r15}            @Reload the registers from SP and return (pc = r15)
8520d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8530d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
8540d8951cef4b1a1dbf4ff5ba3e8796cf1d4503098Harish Mahendrakar
855