// common/armv8/ideint_spatial_filter_av8.s

//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/

//******************************************************************************
//*
//* @brief
//*  This file contains definitions of routines for spatial filter
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*  - ideint_spatial_filter_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************


//******************************************************************************
//*
//*  @brief Performs spatial filtering
//*
//*  @par   Description
//*   This function performs edge adaptive spatial filtering and writes 4 rows
//*   of 8 pixels (presumably the missing lines of an 8x8 block - confirm with
//*   the caller). The 8 pixel wide block is processed as two independent
//*   4 pixel wide halves. For each half:
//*     - absolute differences between each source row and the next one are
//*       summed over 4 row pairs along three directions:
//*         adiff[0] : row_1[i]     vs row_2[i]
//*         adiff[1] : row_1[i - 1] vs row_2[i + 1]
//*         adiff[2] : row_1[i + 1] vs row_2[i - 1]
//*     - adiff[0] is scaled by 5 (EDGE_BIAS_0), adiff[1] and adiff[2] by 7
//*       (EDGE_BIAS_1)
//*     - shift = +1 if adiff[2] <= adiff[1] and adiff[2] <= adiff[0]
//*       shift = -1 if adiff[2] >  adiff[1] and adiff[1] <= adiff[0]
//*       shift =  0 otherwise
//*     - every output pixel is the rounded average of row_1[i + shift] and
//*       row_2[i - shift]
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source (x0). Bytes at column offsets -1 to 8 of
//*  5 consecutive source rows are read.
//*
//* @param[in] pu1_out
//*  UWORD8 pointer to the destination (x1). 8 bytes are written to each of
//*  4 consecutive destination rows.
//*
//* @param[in] src_strd
//*  source stride (w2)
//*
//* @param[in] out_strd
//*  destination stride (w3)
//*
//* @returns
//*     None
//*
//* @remarks
//*  Leaf routine, no stack usage.
//*  Modifies x0-x11, v0-v5, v16, v18, v20, v30, v31 and the condition flags.
//*  NOTE(review): both strides are treated as 32-bit signed integers and are
//*  sign extended on entry - confirm against the C prototype.
//*
//******************************************************************************

    .global ideint_spatial_filter_av8

ideint_spatial_filter_av8:

    // The strides are used as 64-bit post-index offsets below. For a 32-bit
    // argument the upper half of the register is not guaranteed by the
    // procedure call standard, so widen both explicitly.
    sxtw    x2,         w2
    sxtw    x3,         w3

    // Per column accumulators: v16 = adiff[0], v18 = adiff[1], v20 = adiff[2]
    movi    v16.8h,     #0
    movi    v18.8h,     #0
    movi    v20.8h,     #0

    // Backup x0, the interpolation stage needs the original source pointer
    mov     x10,        x0

    // Load from &pu1_row_1[0]
    sub     x5,         x0,         #1
    ld1     {v0.8b},    [x0],       x2

    // Load from &pu1_row_1[-1]
    ld1     {v1.8b},    [x5]
    add     x5,         x5,         #2

    // Load from &pu1_row_1[1]
    ld1     {v2.8b},    [x5]

    // Number of rows
    mov     x4,         #4

    // EDGE_BIAS_0
    movi    v30.2s,     #5

    // EDGE_BIAS_1
    movi    v31.2s,     #7

detect_edge:
    // Load from &pu1_row_2[0]
    sub     x5,         x0,         #1
    ld1     {v3.8b},    [x0],       x2

    // Load from &pu1_row_2[-1]
    ld1     {v4.8b},    [x5]
    add     x5,         x5,         #2

    // Load from &pu1_row_2[1]
    ld1     {v5.8b},    [x5]

    // Accumulate absolute differences
    // pu1_row_1[i] - pu1_row_2[i]
    uabal   v16.8h,     v0.8b,      v3.8b

    // pu1_row_1[i - 1] - pu1_row_2[i + 1]
    uabal   v18.8h,     v1.8b,      v5.8b

    // pu1_row_1[i + 1] - pu1_row_2[i - 1]
    uabal   v20.8h,     v2.8b,      v4.8b

    // Current row_2 becomes row_1 of the next iteration
    mov     v0.8b,      v3.8b
    mov     v1.8b,      v4.8b
    mov     v2.8b,      v5.8b

    subs    x4,         x4,         #1
    bgt     detect_edge

    // Calculate sum of absolute differences for each edge.
    // addp folds the 8 column sums to 4, uaddlp folds those to 2 words:
    // lane 0 = columns 0-3 (first half), lane 1 = columns 4-7 (second half)
    addp    v16.8h,     v16.8h,     v16.8h
    addp    v18.8h,     v18.8h,     v18.8h
    addp    v20.8h,     v20.8h,     v20.8h

    uaddlp  v16.2s,     v16.4h
    uaddlp  v18.2s,     v18.4h
    uaddlp  v20.2s,     v20.4h

    // adiff[0] *= EDGE_BIAS_0;
    mul     v16.2s,     v16.2s,     v30.2s

    // adiff[1] *= EDGE_BIAS_1;
    mul     v18.2s,     v18.2s,     v31.2s

    // adiff[2] *= EDGE_BIAS_1;
    mul     v20.2s,     v20.2s,     v31.2s

    // Move the differences to ARM registers
    // x5 = adiff[0], x6 = adiff[1], x7 = adiff[2]

    // Compute shift for first half of the block (result in x8)
compute_shift_1:
    smov    x5,         v16.s[0]
    smov    x6,         v18.s[0]
    smov    x7,         v20.s[0]

    // Default shift
    mov     x8,         #0

    // adiff[2] <= adiff[1]
    cmp     x7,         x6
    bgt     dir_45_gt_135_1

    // adiff[2] <= adiff[0]
    cmp     x7,         x5
    // Move 1 if less than or equal to
    mov     x11,        #1
    csel    x8,         x11,        x8,     le

    b       compute_shift_2

dir_45_gt_135_1:
    // adiff[1] <= adiff[0]
    cmp     x6,         x5
    // Move -1 if less than or equal to
    movn    x11,        #0
    csel    x8,         x11,        x8,     le


compute_shift_2:
    // Compute shift for second half of the block (result in x9)
    smov    x5,         v16.s[1]
    smov    x6,         v18.s[1]
    smov    x7,         v20.s[1]

    // Default shift
    mov     x9,         #0

    // adiff[2] <= adiff[1]
    cmp     x7,         x6
    bgt     dir_45_gt_135_2

    // adiff[2] <= adiff[0]
    cmp     x7,         x5
    // Move 1 if less than or equal to
    mov     x11,        #1
    csel    x9,         x11,        x9,     le

    b       interpolate

dir_45_gt_135_2:
    // adiff[1] <= adiff[0]
    cmp     x6,         x5

    // Move -1 if less than or equal to
    movn    x11,        #0
    csel    x9,         x11,        x9,     le

interpolate:
    // First half : x4 = pu1_src + shift_1, x5 = pu1_src + src_strd - shift_1
    add     x4,         x10,        x8
    add     x5,         x10,        x2
    sub     x5,         x5,         x8

    // Second half: same pointers 4 columns to the right, using shift_2
    add     x10,        x10,        #4
    add     x6,         x10,        x9
    add     x7,         x10,        x2
    sub     x7,         x7,         x9

    // Number of output rows (x8 is free, shift_1 is already applied)
    mov     x8,         #4

filter_loop:
    // Lane 0 carries 4 pixels of the first half, lane 1 of the second half.
    // v0 = upper row shifted by +shift, v2 = lower row shifted by -shift
    ld1     {v0.s}[0],  [x4],       x2
    ld1     {v2.s}[0],  [x5],       x2

    ld1     {v0.s}[1],  [x6],       x2
    ld1     {v2.s}[1],  [x7],       x2

    // Rounded average: (a + b + 1) >> 1
    urhadd  v4.8b,      v0.8b,      v2.8b
    st1     {v4.2s},    [x1],       x3

    subs    x8,         x8,         #1
    bgt     filter_loop

    ret