@ common/arm/ideint_spatial_filter_a9.s

@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@******************************************************************************
@*
@* @brief
@*  This file contains definitions of routines for spatial filter
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*  - ideint_spatial_filter_a9()
@*
@* @remarks
@*  None
@*
@*******************************************************************************


@******************************************************************************
@*
@*  @brief Performs spatial filtering
@*
@*  @par   Description
@*   This function performs edge adaptive spatial filtering. The header
@*   historically describes the unit of work as an 8x8 block; the routine
@*   itself reads 5 source rows and writes 4 output rows of 8 bytes each.
@*
@*   The 8 columns are handled as two independent halves of 4 columns.
@*   For each half:
@*     1. Over 4 pairs of vertically adjacent source rows (row_1, row_2),
@*        three sums of absolute differences are accumulated:
@*          adiff[0] += |row_1[i]     - row_2[i]    |   (same column)
@*          adiff[1] += |row_1[i - 1] - row_2[i + 1]|   (135 degree diagonal)
@*          adiff[2] += |row_1[i + 1] - row_2[i - 1]|   (45 degree diagonal)
@*     2. adiff[0] is scaled by EDGE_BIAS_0, adiff[1] and adiff[2] by
@*        EDGE_BIAS_1.
@*     3. shift = +1 if adiff[2] <= adiff[1] and adiff[2] <= adiff[0]
@*        shift = -1 if adiff[2] >  adiff[1] and adiff[1] <= adiff[0]
@*        shift =  0 otherwise
@*     4. out[i] = (row_1[i + shift] + row_2[i - shift] + 1) >> 1
@*
@*   Source bytes at columns -1 .. 8 of each of the 5 rows are read, so the
@*   caller must guarantee one readable byte on either side of the 8 columns.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (r0)
@*
@* @param[in] pu1_out
@*  UWORD8 pointer to the destination (r1)
@*
@* @param[in] src_strd
@*  source stride (r2)
@*
@* @param[in] out_strd
@*  destination stride (r3)
@*
@* @returns
@*  None
@*
@* @remarks
@*  r4-r10 are saved and restored. r0, r1, the flags and NEON registers
@*  d0-d5, q8-q10, d30 and d31 are modified. No callee-saved NEON register
@*  (d8-d15) is touched.
@*
@******************************************************************************

    .text
    .p2align    2

    @ Bias applied to the same-column difference sum
    .equ        EDGE_BIAS_0,    5
    @ Bias applied to both diagonal difference sums
    .equ        EDGE_BIAS_1,    7
    @ Number of row pairs examined and number of output rows written
    .equ        NUM_ROWS,       4
    @ Width in bytes of each independently filtered half
    .equ        HALF_BLK_WD,    4

    .global     ideint_spatial_filter_a9
    @ Mark the symbol as a function so the linker can interwork with Thumb
    @ callers
    .type       ideint_spatial_filter_a9, %function

ideint_spatial_filter_a9:

    stmfd       sp!,    {r4-r10, lr}

    @ Clear the per-column 16-bit accumulators:
    @ q8 = adiff[0], q9 = adiff[1], q10 = adiff[2]
    vmov.u16    q8,     #0
    vmov.u16    q9,     #0
    vmov.u16    q10,    #0

    @ Backup r0 (block origin); r0 is advanced while detecting the edge
    mov         r10,    r0

    @ Load from &pu1_row_1[0]
    sub         r5,     r0,     #1
    vld1.8      d0,     [r0],   r2

    @ Load from &pu1_row_1[-1]
    vld1.8      d1,     [r5]
    add         r5,     r5,     #2

    @ Load from &pu1_row_1[1]
    vld1.8      d2,     [r5]

    @ Number of rows
    mov         r4,     #NUM_ROWS

    @ EDGE_BIAS_0 in both 32-bit lanes
    vmov.u32    d30,    #EDGE_BIAS_0

    @ EDGE_BIAS_1 in both 32-bit lanes
    vmov.u32    d31,    #EDGE_BIAS_1

    @ Invariant at loop entry: d0/d1/d2 hold pu1_row_1[0], [-1], [+1] and
    @ r0 points at pu1_row_2[0]
.Ldetect_edge:
    @ Load from &pu1_row_2[0]
    sub         r5,     r0,     #1
    vld1.8      d3,     [r0],   r2

    @ Load from &pu1_row_2[-1]
    vld1.8      d4,     [r5]
    add         r5,     r5,     #2

    @ Load from &pu1_row_2[1]
    vld1.8      d5,     [r5]

    @ Accumulate absolute differences
    @ pu1_row_1[i] - pu1_row_2[i]
    vabal.u8    q8,     d0,     d3

    @ pu1_row_1[i - 1] - pu1_row_2[i + 1]
    vabal.u8    q9,     d1,     d5

    @ pu1_row_1[i + 1] - pu1_row_2[i - 1]
    vabal.u8    q10,    d4,     d2

    @ Current row_2 becomes row_1 of the next iteration
    vmov        d0,     d3
    vmov        d1,     d4
    vmov        d2,     d5

    subs        r4,     r4,     #1
    bgt         .Ldetect_edge

    @ Calculate sum of absolute differences for each edge.
    @ First add adjacent column pairs ...
    vpadd.u16   d16,    d16,    d17
    vpadd.u16   d18,    d18,    d19
    vpadd.u16   d20,    d20,    d21

    @ ... then widen to two 32-bit sums per direction:
    @ lane 0 = columns 0-3 (first half), lane 1 = columns 4-7 (second half)
    vpaddl.u16  d16,    d16
    vpaddl.u16  d18,    d18
    vpaddl.u16  d20,    d20

    @ adiff[0] *= EDGE_BIAS_0;
    vmul.u32    d16,    d16,    d30

    @ adiff[1] *= EDGE_BIAS_1;
    vmul.u32    d18,    d18,    d31

    @ adiff[2] *= EDGE_BIAS_1;
    vmul.u32    d20,    d20,    d31

    @ Compute shift for first half of the block (result in r8)
    @ Move the biased differences of lane 0 to ARM registers
    vmov.u32    r5,     d16[0]
    vmov.u32    r6,     d18[0]
    vmov.u32    r7,     d20[0]

    @ Default shift is 0
    mov         r8,     #0

    @ adiff[2] <= adiff[1]
    cmp         r7,     r6
    bgt         .Ldir_45_gt_135_1

    @ adiff[2] <= adiff[0]
    cmp         r7,     r5
    movle       r8,     #1

    b           .Lcompute_shift_2
.Ldir_45_gt_135_1:

    @ adiff[1] <= adiff[0]
    cmp         r6,     r5
    @ Move -1 if less than or equal to
    mvnle       r8,     #0


.Lcompute_shift_2:
    @ Compute shift for second half of the block (result in r9)
    @ Move the biased differences of lane 1 to ARM registers
    vmov.u32    r5,     d16[1]
    vmov.u32    r6,     d18[1]
    vmov.u32    r7,     d20[1]

    @ Default shift is 0
    mov         r9,     #0

    @ adiff[2] <= adiff[1]
    cmp         r7,     r6
    bgt         .Ldir_45_gt_135_2

    @ adiff[2] <= adiff[0]
    cmp         r7,     r5
    movle       r9,     #1

    b           .Linterpolate
.Ldir_45_gt_135_2:

    @ adiff[1] <= adiff[0]
    cmp         r6,     r5

    @ Move -1 if less than or equal to
    mvnle       r9,     #0

.Linterpolate:
    @ First half:  r4 = pu1_src + shift, r5 = pu1_src + src_strd - shift
    add         r4,     r10,    r8
    add         r5,     r10,    r2
    sub         r5,     r5,     r8

    @ Second half: same pointers, offset by HALF_BLK_WD and using its own shift
    add         r10,    r10,    #HALF_BLK_WD
    add         r6,     r10,    r9
    add         r7,     r10,    r2
    sub         r7,     r7,     r9

    @ Output row counter
    mov         r8,     #NUM_ROWS

.Lfilter_loop:
    @ d0 = upper row (first half in lane 0, second half in lane 1)
    @ d2 = lower row (same lane layout)
    vld1.u32    d0[0],  [r4],   r2
    vld1.u32    d2[0],  [r5],   r2

    vld1.u32    d0[1],  [r6],   r2
    vld1.u32    d2[1],  [r7],   r2

    @ Rounded average of the two rows, stored as one 8 byte output row
    vrhadd.u8   d4,     d0,     d2
    vst1.u32    d4,     [r1],   r3

    subs        r8,     #1
    bgt         .Lfilter_loop

    ldmfd       sp!,    {r4-r10, pc}

    .size       ideint_spatial_filter_a9, . - ideint_spatial_filter_a9