;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;//
;//
;//
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS ARM1136JS



        IF ARM1136JS

MASK_1  EQU 0x01010101

;// Declare input registers

pQ0     RN 0
StepArg RN 1
tC0Arg  RN 2
alpha   RN 6

beta    RN 14
bS      RN 14
tC0     RN 14
ptC0    RN 1

;// Declare Local/Temporary variables

;// Pixels
p_0     RN 3
p_1     RN 5
p_2     RN 4
p_3     RN 2
q_0     RN 8
q_1     RN 9
q_2     RN 10
q_3     RN 12


;// Filtering

ap0q0   RN 1
filt    RN 2

m00     RN 7
m01     RN 11

apflg   RN 0
aqflg   RN 6

tC      RN 1


;// Declarations for bSLT4 kernel

pos     RN 7
neg     RN 12

P0a     RN 1
P1a     RN 8
Q0a     RN 7
Q1a     RN 4

u1      RN 3
max     RN 12
min     RN 2



;// Declarations for bSGE4 kernel

q_3b    RN 9
p_3b    RN 0
apqflg  RN 12

P0b     RN 6
P1b     RN 7
P2b     RN 1

Q0b     RN 9
Q1b     RN 0
Q2b     RN 2

;// Miscellaneous

a       RN 0
t0      RN 3
t1      RN 12
t2      RN 7
t3      RN 11
t4      RN 4
t5      RN 1
t8      RN 6
t9      RN 14
t10     RN 5
t11     RN 9

;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
;//
;// Inputs  - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//         - 2 - filt, 0 - apflg, 6 - aqflg
;//         - 11 - m01, 7 - tC0
;//
;// Outputs - 1,8,7,11 - Output Pixels (P0a,P1a,Q0a,Q1a)
;//
;// Registers Corrupted - 0-3,5-12,14


        M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr

        ;// Since beta <= 18 and alpha <= 255 we know
        ;// -254 <= p0-q0 <= 254
        ;//  -17 <= q1-q0 <= 17
        ;//  -17 <= p1-p0 <= 17

        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
        ;//
        ;// Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
        ;//             = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
        ;//             = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3

        USUB8   t1, p_1, p_0
        MUL     tC0, t2, m01

        USUB8   t2, q_1, q_0
        SSUB8   t1, t1, t2

        USUB8   t2, p_0, q_0
        AND     t2, t2, m01
        SHSUB8  t1, t1, t2
        UHSUB8  t5, p_0, q_0
        SSUB8   t1, t1, t2
        SHSUB8  t1, t1, t5
        MOV     m00, #0
        SADD8   t1, t1, m01
        SHSUB8  t1, t1, t5

        ;// tC = tC0
        ;// if (ap < beta) tC++;
        ;// if (aq < beta) tC++;
        USUB8   t5, filt, m01
        SEL     tC0, tC0, m00
        UQADD8  tC, tC0, apflg
        SSUB8   t1, t1, m00
        UQADD8  tC, tC, aqflg

        ;// Split into positive and negative part and clip
        SEL     pos, t1, m00
        USUB8   neg, pos, t1
        USUB8   t3, pos, tC
        SEL     pos, tC, pos
        USUB8   t3, neg, tC
        SEL     neg, tC, neg

        ;// Reload m01
        LDR     m01, =MASK_1

        UQADD8  P0a, p_0, pos
        UQSUB8  Q0a, q_0, pos
        UQSUB8  P0a, P0a, neg
        UQADD8  Q0a, Q0a, neg

        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t1, filt, m01
        SEL     P0a, P0a, p_0
        SEL     Q0a, Q0a, q_0
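        ;// For reference, a scalar sketch of the p0/q0 update implemented
        ;// by the SIMD code above: delta is split into its positive and
        ;// negative parts, each part is clipped against tC (the Clip3),
        ;// and the saturating UQADD8/UQSUB8 give the final clamp to
        ;// [0,255]; filt then selects filtered vs original per pixel.
        ;//
        ;//   delta = Clip3(-tC, tC, (((q0-p0)<<2) + (p1-q1) + 4)>>3);
        ;//   P0a   = Clip(0, 255, p0 + delta);
        ;//   Q0a   = Clip(0, 255, q0 - delta);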
        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
        ;// u1 = (p0 + q0 + 1)>>1
        ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
        MVN     p_0, p_0
        UHSUB8  u1, q_0, p_0
        UQADD8  max, p_1, tC0
        EOR     u1, u1, m01, LSL #7

        ;// Calculate A = (p2+u1)>>1
        ;// Then delta = Clip3( -tC0, tC0, A - p1)

        ;// Clip P1
        UHADD8  P1a, p_2, u1
        UQSUB8  min, p_1, tC0
        USUB8   t4, P1a, max
        SEL     P1a, max, P1a
        USUB8   t4, P1a, min
        SEL     P1a, P1a, min

        ;// Clip Q1
        UHADD8  Q1a, q_2, u1
        UQADD8  max, q_1, tC0
        UQSUB8  min, q_1, tC0
        USUB8   t0, Q1a, max
        SEL     Q1a, max, Q1a
        USUB8   t0, Q1a, min
        SEL     Q1a, Q1a, min

        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t0, apflg, m01
        SEL     P1a, P1a, p_1
        USUB8   t0, aqflg, m01
        SEL     t3, Q1a, q_1

        M_END

;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Inputs  - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//         - 2 - filt, 0 - apflg,aqflg
;//         - 1 - ap0q0, 6 - alpha
;//         - 7 - m00, 11 - m01
;//
;// Outputs - 6,7,1,9,0,2 - Output Pixels (P0b,P1b,P2b,Q0b,Q1b,Q2b)
;//
;// Registers Corrupted - 0-3,5-12,14

        M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr

        ;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
        ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)

        M_ARG   pDummy, 4
        M_ARG   pQ_3, 4
        M_ARG   pP_3, 4

        UHADD8  alpha, alpha, m00
        USUB8   t9, p_2, p_0        ;// t9 = dp2p0
        UHADD8  alpha, alpha, m00
        ADD     alpha, alpha, m01, LSL #1
        USUB8   ap0q0, ap0q0, alpha
        SEL     apqflg, m00, apflg

        ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
        ;//    = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)

        ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
        ;//    = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)

        ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
        ;//    = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)

        ;// Compute P0b
        USUB8   t2, p_0, q_0
        SSUB8   t5, t9, t2

        USUB8   t8, q_1, q_0
        SHADD8  t8, t5, t8

        USUB8   t9, p_1, p_0
        SADD8   t8, t8, t9
        SHSUB8  t8, t8, t2
        SHADD8  t5, t5, t9
        SHADD8  t8, t8, m01
        SHADD8  t9, t5, m01
        SADD8   P0b, p_0, t8
        ;// P0b ready

        ;// Compute P1b
        M_LDR   p_3b, pP_3
        SADD8   P1b, p_0, t9
        ;// P1b ready

        ;// Compute P2b
        USUB8   t9, p_2, p_0
        SADD8   t5, t5, t9
        UHSUB8  t9, p_3b, p_0
        EOR     a, p_3b, p_0
        AND     a, a, m01
        SHADD8  t5, t5, a
        UHADD8  a, p_0, q_1
        SADD8   t5, t5, m01
        SHADD8  t5, t5, t9
        MVN     t9, p_1
        SADD8   P2b, p_0, t5
        ;// P2b ready

        UHSUB8  a, a, t9
        ORR     t9, apqflg, m01
        USUB8   t9, apqflg, t9

        EOR     a, a, m01, LSL #7
        SEL     P0b, P0b, a
        SEL     P1b, P1b, p_1
        SEL     P2b, P2b, p_2

        USUB8   t4, filt, m01
        SEL     P0b, P0b, p_0


        ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
        ;//    = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)

        ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
        ;//    = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)

        ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
        ;//    = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)

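        ;// Selection sketch for the q-side outputs computed below: when
        ;// the aq && |p0-q0| < ((alpha>>2)+2) test (folded into apqflg
        ;// above) fails for a pixel, Q0b falls back to
        ;// (2*q1 + q0 + p1 + 2)>>2 and Q1b/Q2b keep the original q1/q2;
        ;// filt then selects between the filtered and unfiltered q0.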
        ;// Compute Q0b Q1b
        USUB8   t4, q_2, q_0
        USUB8   a, p_0, q_0
        USUB8   t9, p_1, p_0
        SADD8   t0, t4, a
        SHADD8  t9, t0, t9
        UHADD8  t10, q_0, p_1
        SADD8   t9, t9, a
        USUB8   a, q_1, q_0
        SHADD8  t9, t9, a
        SHADD8  t0, t0, a
        SHADD8  t9, t9, m01
        SHADD8  a, t0, m01
        SADD8   t9, q_0, t9
        ;// Q0b ready - t9

        MOV     t4, #0
        UHADD8  apqflg, apqflg, t4

        SADD8   Q1b, q_0, a
        ;// Q1b ready

        USUB8   t4, apqflg, m01
        SEL     Q1b, Q1b, q_1
        MVN     t11, q_1
        UHSUB8  t10, t10, t11
        M_LDR   q_3b, pQ_3
        EOR     t10, t10, m01, LSL #7
        SEL     t9, t9, t10

        ;// Compute Q2b
        USUB8   t4, q_2, q_0
        SADD8   t4, t0, t4
        EOR     t0, q_3b, q_0
        AND     t0, t0, m01
        SHADD8  t4, t4, t0
        UHSUB8  t10, q_3b, q_0
        SADD8   t4, t4, m01
        SHADD8  t4, t4, t10

        USUB8   t10, filt, m01
        SEL     Q0b, t9, q_0

        SADD8   t4, q_0, t4
        ;// Q2b ready - t4

        USUB8   t10, apqflg, m01
        SEL     Q2b, t4, q_2

        M_END

        ENDIF

        END