1@/******************************************************************************
2@ *
3@ * Copyright (C) 2015 The Android Open Source Project
4@ *
5@ * Licensed under the Apache License, Version 2.0 (the "License");
6@ * you may not use this file except in compliance with the License.
7@ * You may obtain a copy of the License at:
8@ *
9@ * http://www.apache.org/licenses/LICENSE-2.0
10@ *
11@ * Unless required by applicable law or agreed to in writing, software
12@ * distributed under the License is distributed on an "AS IS" BASIS,
13@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@ * See the License for the specific language governing permissions and
15@ * limitations under the License.
16@ *
17@ *****************************************************************************
18@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19@*/
20@**
21@ *******************************************************************************
22@ * @file
23@ *  ih264_ihadamard_scaling_a9.s
24@ *
25@ * @brief
26@ *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
27@ *  of 16x16 intra-prediction
28@ *
29@ * @author
30@ *  Mohit
31@ *
32@ * @par List of Functions:
33@ *  - ih264_ihadamard_scaling_4x4_a9()
34@ *  - ih264_ihadamard_scaling_2x2_uv_a9()
35@ *
36@ * @remarks
37@ *  None
38@ *
39@ *******************************************************************************
40@ *
41@ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
42@ * of a 16x16 intra prediction macroblock, and then performs scaling.
43@ * prediction buffer
44@ *
45@ * @par Description:
46@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  The inverse-transformed content is then scaled based on the Qp value.
48@ *
49@ * @param[in] pi2_src
50@ *  input 4x4 block of DC coefficients
51@ *
52@ * @param[out] pi2_out
53@ *  output 4x4 block
54@ *
55@ * @param[in] pu2_iscal_mat
56@ *  pointer to scaling list
57@ *
58@ * @param[in] pu2_weigh_mat
59@ *  pointer to weight matrix
60@ *
61@ * @param[in] u4_qp_div_6
62@ *  Floor (qp/6)
63@ *
64@ * @param[in] pi4_tmp
65@ * temporary buffer of size 1*16
66@ *
67@ * @returns none
68@ *
69@ * @remarks none
70@ *
71@ *******************************************************************************
72@ *
73@ *
74@ *******************************************************************************
75@ *
76@ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
77@       WORD16* pi2_out,
78@       const UWORD16 *pu2_iscal_mat,
79@       const UWORD16 *pu2_weigh_mat,
80@       UWORD32 u4_qp_div_6,
81@       WORD32* pi4_tmp)
@**************Variables Vs Registers*****************************************
@r0   => *pi2_src
@r1   => *pi2_out
@r2   => *pu2_iscal_mat, then pu2_iscal_mat[0]
@r3   => *pu2_weigh_mat, then pu2_weigh_mat[0]
@r12  => u4_qp_div_6, then pu2_iscal_mat[0] * pu2_weigh_mat[0]
@[sp] => u4_qp_div_6 (5th argument, first stack slot on entry)
@
@q9   => scale factor replicated in all four 32-bit lanes
@q10  => u4_qp_div_6 replicated in all four 32-bit lanes
@
@The 6th argument (pi4_tmp) is never read; all intermediates live in NEON
@registers.
@
@Leaf routine: it only touches registers that the AAPCS lets a callee clobber
@(r0-r3, r12, q0-q3, q8-q15), so nothing is saved or restored on the stack.

.text
.p2align 2

    .global ih264_ihadamard_scaling_4x4_a9

ih264_ihadamard_scaling_4x4_a9:

@VLD4.16 is used because the source row stride is SUB_BLK_WIDTH_4x4 (4)
@coefficients: it de-interleaves the 16 inputs so that each D register holds
@one column of the block. If the macro value changes, the load must change too.
@
@Scaling is done as a left shift by u4_qp_div_6 followed by a rounding right
@shift by 6, i.e. ((x * scale) << (qp/6) + 32) >> 6, saturated to 16 bits.

    ldr           r12, [sp]             @ r12 = u4_qp_div_6 (nothing pushed, so it is at [sp])
    ldrh          r2, [r2]              @ r2  = pu2_iscal_mat[0] (unsigned halfword)
    ldrh          r3, [r3]              @ r3  = pu2_weigh_mat[0] (unsigned halfword)
    vdup.s32      q10, r12              @ q10 = left-shift count in every lane
    mul           r12, r2, r3           @ r12 = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    vdup.s32      q9, r12               @ q9  = scale factor in every lane

@=======================INVERSE HADAMARD TRANSFORM================================

    @ ---- horizontal pass: lane i of each register belongs to row i ----
    vld4.s16      {d0, d1, d2, d3}, [r0] @ d0..d3 = columns 0..3 (x4, x5, x6, x7 of each row)
    vaddl.s16     q12, d0, d3           @ x0 = x4 + x7   (widened to 32 bit)
    vaddl.s16     q13, d1, d2           @ x1 = x5 + x6
    vsubl.s16     q14, d1, d2           @ x2 = x5 - x6
    vsubl.s16     q15, d0, d3           @ x3 = x4 - x7

    vadd.s32      q2, q12, q13          @ tmp column 0 = x0 + x1
    vadd.s32      q3, q15, q14          @ tmp column 1 = x3 + x2
    vsub.s32      q8, q12, q13          @ tmp column 2 = x0 - x1
    vsub.s32      q11, q15, q14         @ tmp column 3 = x3 - x2

    @ ---- 4x4 transpose of 32-bit elements: columns -> rows ----
    vtrn.32       q2, q3                @ swap the off-diagonal elements of each 2x2 tile
    vtrn.32       q8, q11
    vswp          d5, d16               @ exchange 2x2 tiles: q2 = row 0 (x4), q8  = row 2 (x6)
    vswp          d7, d22               @                     q3 = row 1 (x5), q11 = row 3 (x7)

    @ ---- vertical pass: lane j of each register belongs to column j ----
    vadd.s32      q12, q2, q11          @ x0 = x4 + x7
    vadd.s32      q13, q3, q8           @ x1 = x5 + x6
    vsub.s32      q14, q3, q8           @ x2 = x5 - x6
    vsub.s32      q15, q2, q11          @ x3 = x4 - x7

    vadd.s32      q0, q12, q13          @ output row 0 = x0 + x1
    vadd.s32      q1, q15, q14          @ output row 1 = x3 + x2
    vsub.s32      q2, q12, q13          @ output row 2 = x0 - x1
    vsub.s32      q3, q15, q14          @ output row 3 = x3 - x2

@=======================SCALING===================================================

    vmul.s32      q0, q0, q9            @ p[i] = x[i] * scale            i = 0..3
    vmul.s32      q1, q1, q9            @                                i = 4..7
    vmul.s32      q2, q2, q9            @                                i = 8..11
    vmul.s32      q3, q3, q9            @                                i = 12..15

    vshl.s32      q0, q0, q10           @ q[i] = p[i] << (qP/6)          i = 0..3
    vshl.s32      q1, q1, q10           @                                i = 4..7
    vshl.s32      q2, q2, q10           @                                i = 8..11
    vshl.s32      q3, q3, q10           @                                i = 12..15

    vqrshrn.s32   d0, q0, #0x6          @ c[i] = sat16((q[i] + 32) >> 6) i = 0..3
    vqrshrn.s32   d1, q1, #0x6          @                                i = 4..7
    vqrshrn.s32   d2, q2, #0x6          @                                i = 8..11
    vqrshrn.s32   d3, q3, #0x6          @                                i = 12..15

    vst1.s16      {d0, d1, d2, d3}, [r1] @ store all four output rows (16 halfwords)

    bx            lr                    @ nothing to restore
160
161
162
163@ *******************************************************************************
164@ *
165@ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
166@ *
167@ * @par Description:
168@ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
@ *  The inverse-transformed content is then scaled based on the Qp value.
@ *  The DC blocks of both the U and the V planes are processed.
171@ *
172@ * @param[in] pi2_src
@ *  input 1x8 block of coeffs. The first 4 are from U and the next 4 from V
174@ *
175@ * @param[out] pi2_out
176@ *  output 1x8 block
177@ *
178@ * @param[in] pu2_iscal_mat
179@ *  pointer to scaling list
180@ *
181@ * @param[in] pu2_weigh_mat
182@ *  pointer to weight matrix
183@ *
184@ * @param[in] u4_qp_div_6
185@ *  Floor (qp/6)
186@ *
187@ * @returns none
188@ *
189@ * @remarks none
190@ *
191@ *******************************************************************************
192@ *
193@ *
194@ *******************************************************************************
195@ *
196@ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
197@                                  WORD16* pi2_out,
198@                                  const UWORD16 *pu2_iscal_mat,
199@                                  const UWORD16 *pu2_weigh_mat,
200@                                  UWORD32 u4_qp_div_6,
201
202    .global ih264_ihadamard_scaling_2x2_uv_a9
ih264_ihadamard_scaling_2x2_uv_a9:

@Registers used
@   r0   : *pi2_src
@   r1   : *pi2_out
@   r2   : *pu2_iscal_mat, then pu2_iscal_mat[0]
@   r3   : *pu2_weigh_mat, then pu2_weigh_mat[0]
@   r12  : u4_qp_div_6 - 5, then pu2_iscal_mat[0] * pu2_weigh_mat[0]
@   [sp] : u4_qp_div_6 (5th argument, first stack slot on entry)
@
@   q14  : signed shift count (qp/6 - 5) in every 32-bit lane
@   q15  : scale factor in every 32-bit lane
@
@Leaf routine: only registers the AAPCS lets a callee clobber are used
@(r0-r3, r12, q0-q3, q8, q9, q14, q15), so nothing is saved on the stack.

    ldr           r12, [sp]             @ r12 = u4_qp_div_6
    ldrh          r2, [r2]              @ r2  = pu2_iscal_mat[0]
    ldrh          r3, [r3]              @ r3  = pu2_weigh_mat[0]
    sub           r12, r12, #5          @ r12 = qp/6 - 5 (negative => right shift in VSHL)
    vdup.s32      q14, r12              @ q14 = shift count in every lane
    mul           r12, r2, r3           @ r12 = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    vdup.s32      q15, r12              @ q15 = scale factor in every lane

    vld2.s16      {d0, d1}, [r0]        @load 8 dc coeffs, de-interleaved:
                                        @i2_x4,i2_x6,i2_y4,i2_y6 -> d0
                                        @i2_x5,i2_x7,i2_y5,i2_y7 -> d1

    vaddl.s16     q1, d0, d1            @ q1 = x0, x2, y0, y2   (i4_x0 = i4_x4 + i4_x5 ...)
    vsubl.s16     q2, d0, d1            @ q2 = x1, x3, y1, y3   (i4_x1 = i4_x4 - i4_x5 ...)

    vtrn.s32      q1, q2                @ q1 = x0, x1, y0, y1 ; q2 = x2, x3, y2, y3

    vadd.s32      q3, q1, q2            @ q3 = x4, x5, y4, y5   (i4_x4 = i4_x0 + i4_x2 ...)
    vsub.s32      q1, q1, q2            @ q1 = x6, x7, y6, y7   (i4_x6 = i4_x0 - i4_x2 ...)

    vmul.s32      q8, q3, q15           @ multiply by the scale factor
    vmul.s32      q9, q1, q15

    vshl.s32      q8, q8, q14           @ shift by (qp/6 - 5); a right shift here truncates
    vshl.s32      q9, q9, q14

    vmovn.s32     d0, q8                @ narrow to 16 bit: x4 x5 y4 y5
    vmovn.s32     d1, q9                @ narrow to 16 bit: x6 x7 y6 y7

    vst2.s32      {d0, d1}, [r1]        @ interleave 32-bit pairs -> x4 x5 x6 x7 y4 y5 y6 y7

    bx            lr
249
250
251