; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorQuarter function
;--
;-------------------------------------------------------------------------------


    ;// Stack-alignment attributes: REQUIRE8 / PRESERVE8 are emitted for every
    ;// build except when H264DEC_WINASM is defined.
    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    ;// Everything below is assembled into the |.text| code area.
    AREA    |.text|, CODE

;// h264bsdInterpolateHorQuarter register allocation
;//
;// Several names below alias the same physical register. The first name of
;// each pair is used during argument checking / setup, the second one inside
;// the filter loops, so the two uses never overlap in time.

ref     RN 0    ;// r0: read pointer into the reference pixels

mb      RN 1    ;// r1: write pointer for the output block
buff    RN 1    ;// r1: buffer pointer passed to h264bsdFillBlock

count   RN 2    ;// r2: packed loop counters + width (layout documented at the packing code)
x0      RN 2    ;// r2: incoming x0 argument

y0      RN 3    ;// r3: incoming y0 argument
x_2_0   RN 3    ;// r3: two pixels unpacked to 16-bit halves (see UXTB16 in loop)

width   RN 4    ;// r4: width stack argument
x_3_1   RN 4    ;// r4: two pixels unpacked to 16-bit halves

height  RN 5    ;// r5: height stack argument
x_6_4   RN 5    ;// r5: two pixels unpacked to 16-bit halves

partW   RN 6    ;// r6: partWidth stack argument
x_7_5   RN 6    ;// r6: two pixels unpacked to 16-bit halves

partH   RN 7    ;// r7: partHeight stack argument
tmp1    RN 7    ;// r7: filter accumulator / scratch

tmp2    RN 8    ;// r8: filter accumulator / scratch

tmp3    RN 9    ;// r9: filter accumulator / scratch

tmp4    RN 10   ;// r10: filter accumulator / scratch

mult_20_01  RN 11   ;// r11: constant 0x00140001 (halfwords 20 and 1)

mult_20_m5  RN 12   ;// r12: constant 0x0014FFFB (halfwords 20 and -5)

plus16  RN 14   ;// r14 (lr, saved in the prologue): rounding constant 16


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateHorQuarter


;// Horizontal filter approach
;//
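;// The basic idea of the horizontal filter is to arrange the coefficients
;// as shown below. Calculations are done with 16-bit arithmetic.
;// Each register holds two pixels as 16-bit halves (e.g. x_2_0 holds
;// pixels 2 and 0), and each row y_n lists the coefficient applied to
;// every pixel that contributes to output pixel n.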
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1


;//-----------------------------------------------------------------------------
;// h264bsdInterpolateHorQuarter
;//
;// Writes a partWidth x partHeight block of horizontally interpolated pixels.
;// For every output pixel the 6-tap filter (1, -5, 20, 20, -5, 1) is applied
;// to six horizontally adjacent reference pixels, 16 is added, the sum is
;// shifted right by 5 and saturated to 8 bits. That value is then averaged
;// (rounding up) with one reference pixel chosen by the fifth stack argument.
;//
;// Register arguments:
;//   r0 = ref   pointer to the reference pixels
;//   r1 = mb    pointer to the output block; successive rows are written
;//              16 bytes apart
;//   r2 = x0    horizontal position of the block inside the reference
;//   r3 = y0    vertical position of the block inside the reference
;// Stack arguments (offsets are valid after the prologue):
;//   [sp,#0x218] width
;//   [sp,#0x21c] height
;//   [sp,#0x220] partWidth
;//   [sp,#0x224] partHeight
;//   [sp,#0x228] byte offset selecting the pixel that is averaged with the
;//               filter output (used as ref + offset - 10 in the loops);
;//               presumably the horizontal quarter-pel offset -- confirm the
;//               name against the C prototype
;//
;// Stack frame after the prologue:
;//   sp+0x000 .. sp+0x010  outgoing stack arguments for h264bsdFillBlock
;//   sp+0x028 ..           temporary block buffer handed to h264bsdFillBlock
;//   sp+0x1e4 .. sp+0x1f0  saved r0-r3 (ref, mb, x0, y0)
;//   sp+0x1f4 ..           saved r4-r11, lr
;//   sp+0x218 ..           caller's stack arguments
;//
;// Constraints implied by the code: partWidth-1 and partHeight-1 are kept in
;// 4-bit fields, width is kept in a 16-bit field that is read as a signed
;// halfword, and the inner loop emits four pixels per step, so partWidth is
;// expected to be a multiple of 4.
;//-----------------------------------------------------------------------------
h264bsdInterpolateHorQuarter
    ;// Prologue: save the register arguments, r4-r11 and lr (13 words =
    ;// 0x34 bytes), then reserve 0x1e4 bytes of locals.
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

    ;// The filter reads partWidth+5 pixels per row. If that window is not
    ;// completely inside the reference area, take the do_fill path.
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, x0, partW         ;// (x0+partWidth)
    ADD     tmp4, tmp4, #5          ;// (x0+partWidth+5)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp4, width
    BHI     do_fill                 ;// (x0+partWidth+5) > width (unsigned)

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp2, y0, partH         ;// (y0+partHeight)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp2, height
    BLS     skip_fill               ;// no overfill needed


do_fill
    ;// Call h264bsdFillBlock with
    ;//   r0 = ref, r1 = buff, r2 = x0, r3 = y0 (r0/r2/r3 still hold the
    ;//   incoming argument values at this point),
    ;//   [sp+0] = width, [sp+4] = height, [sp+8] = partWidth+5,
    ;//   [sp+0xc] = partHeight, [sp+0x10] = partWidth+5.
    ;// Presumably it copies the needed window, with edge padding, into buff;
    ;// its implementation is not part of this file.
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1[21*21/4+1] (local buffer at sp+0x28)
    BL      h264bsdFillBlock

    ;// From here on read from the local buffer instead of the original
    ;// reference: position (0,0), row length partWidth+5. This relies on the
    ;// callee having preserved tmp4 (r10), partW (r6) and partH (r7).
    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// saved x0 = 0
    STR     x0,[sp,#0x1f0]          ;// saved y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1
    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5


skip_fill
    ;// Reload x0/y0/width from the stack (they were overwritten above if the
    ;// fill path ran) and point ref at the first pixel of the block.
    LDR     x0 ,[sp,#0x1ec]         ;// x0
    LDR     y0 ,[sp,#0x1f0]         ;// y0
    LDR     width, [sp,#0x218]      ;// width
    MLA     tmp2, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp2          ;// ref += y0*width+x0
    ADD     ref, ref, #8            ;// ref = ref+8; the loop loads at [ref,#-8] and [ref,#-4]
    LDR     mb, [sp, #0x1e8]        ;// mb (r1 was reused as buff on the fill path)

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    ;// Fields [23:16] stay constant; [31:24] are the running counters.
    MOV     count, width
    SUB     partW, partW, #1;
    SUB     partH, partH, #1;
    ADD     tmp2, partH, partW, LSL #4
    ADD     count, count, tmp2, LSL #16


    LDR     mult_20_01, = 0x00140001    ;// halfwords: top = 20, bottom = 1
    LDR     mult_20_m5, = 0x0014FFFB    ;// halfwords: top = 20, bottom = -5
    MOV     plus16, #16                 ;// rounding term added before >>5
    AND     tmp1, count, #0x000F0000    ;// partHeight-1
    AND     tmp3, count, #0x00F00000    ;// partWidth-1
    ADD     count, count, tmp1, LSL #8  ;// loop_y field = partHeight-1
loop_y
    ;// Start of a row: re-arm the loop_x field and unpack the first eight
    ;// pixels into 16-bit halves (even pixels in x_2_0/x_6_4, odd pixels in
    ;// x_3_1/x_7_5; the register names assume little-endian loads).
    LDR     x_3_1, [ref, #-8]
    ADD     count, count, tmp3, LSL #8  ;// loop_x field = partWidth-1
    LDR     x_7_5, [ref, #-4]
    UXTB16  x_2_0, x_3_1
    UXTB16  x_3_1, x_3_1, ROR #8
    UXTB16  x_6_4, x_7_5

loop_x
    ;// First half: four output pixels from pixels 0..8.
    ;//   tmp1 = output 0, tmp2 = output 1, tmp3 = output 2, tmp4 = output 3
    ;// Each accumulator starts from +16 and collects the six taps
    ;// 1, -5, 20, 20, -5, 1.
    UXTB16  x_7_5, x_7_5, ROR #8

    SMLAD   tmp1, x_2_0, mult_20_01, plus16
    SMLATB  tmp3, x_2_0, mult_20_01, plus16
    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
    SMLATB  tmp4, x_3_1, mult_20_01, plus16

    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    LDR     x_3_1, [ref], #4            ;// next four pixels (8..11), ref += 4
    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4

    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4

    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    UXTB16  x_2_0, x_3_1                ;// x_2_0 now holds pixels 10 and 8
    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    SMLABB  tmp4, x_2_0, mult_20_01, tmp4

    ;// >>5, pair the results as halfwords (tmp1 = out2:out0,
    ;// tmp2 = out3:out1) and saturate each halfword to 0..255.
    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    LDR     tmp4, [sp, #0x228]          ;// fifth stack argument (offset)
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1
    SUB     tmp4, tmp4, #10             ;// ref is 12 past the leftmost tap: 12-10 = +2

    ;// Four pixels done. Carry clear (borrow) means the row is finished; the
    ;// flags are consumed by the BCC below, and none of the instructions in
    ;// between update them.
    SUBS    count, count, #4<<28
    LDR     tmp3, [ref, tmp4]           ;// 4 reference pixels at leftmost tap + 2 + offset
    ORR     tmp1, tmp1, tmp2, LSL #8    ;// bytes: out0, out1, out2, out3

;// quarter pel position
    ;// Per byte: UHSUB8 gives (filtered - ~pix) >> 1, and flipping bit 7
    ;// adds 128, which together equals (filtered + pix + 1) >> 1.
    LDR     tmp2, = 0x80808080
    MVN     tmp3, tmp3
    UHSUB8  tmp1, tmp1, tmp3
    EOR     tmp1, tmp1, tmp2
    STR     tmp1, [mb], #4

    BCC     next_y

    ;// Second half: same computation with the register pairs swapped
    ;// (x_6_4/x_7_5 are now the leading pixels, x_2_0/x_3_1 the trailing
    ;// ones), so no register moves are needed between halves.
    UXTB16  x_3_1, x_3_1, ROR #8

    SMLAD   tmp1, x_6_4, mult_20_01, plus16
    SMLATB  tmp3, x_6_4, mult_20_01, plus16
    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
    SMLATB  tmp4, x_7_5, mult_20_01, plus16

    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    LDR     x_7_5, [ref], #4            ;// next four pixels, ref += 4
    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4

    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4

    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    UXTB16  x_6_4, x_7_5
    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    SMLABB  tmp4, x_6_4, mult_20_01, tmp4

    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    LDR     tmp4, [sp, #0x228]          ;// fifth stack argument (offset)
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1
    SUB     tmp4, tmp4, #10

    SUBS    count, count, #4<<28        ;// flags consumed by BCS below
    LDR     tmp3, [ref, tmp4]
    ORR     tmp1, tmp1, tmp2, LSL #8

;// quarter pel
    ;// Same rounding average as in the first half.
    LDR     tmp2, = 0x80808080
    MVN     tmp3, tmp3
    UHSUB8  tmp1, tmp1, tmp3
    EOR     tmp1, tmp1, tmp2

    STR     tmp1, [mb], #4
    BCS     loop_x                      ;// more pixels left in this row

next_y
    ;// Advance both pointers to the next row:
    ;//   ref += width - partWidth, mb += 16 - partWidth.
    ;// ADDS leaves carry clear, so each SBC subtracts one extra, turning
    ;// (partWidth-1) into partWidth. SBC without S keeps the carry for the
    ;// second SBC.
    AND     tmp3, count, #0x00F00000    ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref ;// +width
    ADDS    mb, mb, #16                 ;// +16, Carry=0
    SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1
    SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1
    ;// Clear the wrapped loop_x field (+1<<28) and decrement loop_y
    ;// (-1<<24). The result goes negative once loop_y drops below zero.
    ADDS    count, count, #(1<<28)-(1<<24)
    BGE     loop_y

    ;// Epilogue: drop the locals and the saved r0-r3 (0x1e4 + 0x10), then
    ;// restore r4-r11 and return by loading pc.
    ADD     sp,sp,#0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END