1///*****************************************************************************
2//*
3//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************/
18///**
19// *******************************************************************************
20// * @file
21// *  ihevc_itrans_recon_4x4_neon.s
22// *
23// * @brief
24// *  contains function definitions for single stage  inverse transform
25// *
26// * @author
27// *     naveen sr
28// *
29// * @par list of functions:
30// *  - ihevc_itrans_recon_4x4()
31// *
32// * @remarks
33// *  none
34// *
35// *******************************************************************************
36//*/
37// /**
38// *******************************************************************************
39// *
40// * @brief
41// *  this function performs inverse transform  and reconstruction for 4x4
42// * input block
43// *
44// * @par description:
45// *  performs inverse transform and adds the prediction  data and clips output
46// * to 8 bit
47// *
48// * @param[in] pi2_src
49// *  input 4x4 coefficients
50// *
51// * @param[in] pi2_tmp
// *  temporary 4x4 buffer for storing the inverse transform 1st stage output
56// *
57// * @param[in] pu1_pred
58// *  prediction 4x4 block
59// *
60// * @param[out] pu1_dst
61// *  output 4x4 block
62// *
63// * @param[in] src_strd
64// *  input stride
65// *
66// * @param[in] pred_strd
67// *  prediction stride
68// *
69// * @param[in] dst_strd
70// *  output stride
71// *
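// * @param[in] shift
// *  output shift (not an argument in the prototype below; the per-stage
// *  shifts are the assembly-time constants shift_stage1_idct and
// *  shift_stage2_idct)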
74// *
75// * @param[in] zero_cols
76// *  zero columns in pi2_src
77// *
78// * @returns  void
79// *
80// * @remarks
81// *  none
82// *
83// *******************************************************************************
84// */
85//void ihevc_itrans_recon_4x4(word16 *pi2_src,
86//        word16 *pi2_tmp,
87//        uword8 *pu1_pred,
88//        uword8 *pu1_dst,
89//        word32 src_strd,
90//        word32 pred_strd,
91//        word32 dst_strd,
92//        word32 zero_cols)
93//**************variables vs registers*************************
94//    x0 => *pi2_src
95//    x1 => *pi2_tmp
96//    x2 => *pu1_pred
97//    x3 => *pu1_dst
98//    x4 => src_strd
99//    x5 => pred_strd
100//    x6 => dst_strd
101//    x7 => zero_cols
102
// Code goes into the text section; 2^4 = 16-byte alignment.
.text
.p2align 4

// Project-local include (its contents are not visible from this file).
.include "ihevc_neon_macros.s"

// Rounding right-shift applied after the first and the second 1-D pass.
.equ shift_stage1_idct, 7
.equ shift_stage2_idct, 12

// External coefficient table read by the routine, and the exported entry point.
.extern g_ai2_ihevc_trans_4_transpose
.global ihevc_itrans_recon_4x4_av8
.type ihevc_itrans_recon_4x4_av8, %function
118
//------------------------------------------------------------------------------
// ihevc_itrans_recon_4x4_av8
//
// Two-pass 4x4 inverse transform, followed by adding the 8-bit prediction and
// clipping the result to unsigned 8 bit.
//
// ABI:   AAPCS64
// In:    x0 = pi2_src   16-bit coefficients, rows src_strd elements apart
//        x1 = pi2_tmp   not read by this implementation
//        x2 = pu1_pred  8-bit prediction, rows pred_strd bytes apart
//        x3 = pu1_dst   8-bit output, rows dst_strd bytes apart
//        w4 = src_strd, w5 = pred_strd, w6 = dst_strd (32-bit; widened below)
//        w7 = zero_cols not read by this implementation
// Out:   four 4-byte rows stored through x3
// Clobb: x0, x2-x6, x8-x10, v0-v7, v16-v20, v22-v31
// Stack: none (leaf routine; no callee-saved register is modified)
//------------------------------------------------------------------------------
ihevc_itrans_recon_4x4_av8:

    // The strides are 32-bit C integers. AAPCS64 leaves bits 63:32 of their
    // argument registers unspecified, so sign-extend them before they are
    // used in 64-bit address arithmetic and as post-index offsets.
    sxtw        x4, w4                      // src_strd
    sxtw        x5, w5                      // pred_strd
    sxtw        x6, w6                      // dst_strd

    adrp        x8, :got:g_ai2_ihevc_trans_4_transpose
    ldr         x8, [x8, #:got_lo12:g_ai2_ihevc_trans_4_transpose]

    add         x4, x4, x4                  // x4 = src_strd in bytes (one row of word16)
    add         x9, x0, x4                  // x9 = address of source row 1

    ld1         {v4.4h}, [x8]               // first row of g_ai2_ihevc_trans_4_transpose
    // v4 lanes 3..0 = {36,64,83,64}: h[1] = 83 and h[3] = 36 are the odd-part factors
    add         x10, x9, x4, lsl #1         // x10 = row 1 + two rows = address of source row 3
    add         x4, x4, x4                  // x4 = two source rows in bytes
    ld1         {v1.4h}, [x9]               // pi2_src row 1
    ld1         {v3.4h}, [x10]              // pi2_src row 3
    ld1         {v0.4h}, [x0], x4           // pi2_src row 0, then x0 += two rows
    ld1         {v2.4h}, [x0], x4           // pi2_src row 2


    // ---- first stage: rows in v0..v3, one column per lane -------------------
    smull       v6.4s, v1.4h, v4.h[1]       // 83 * pi2_src[1]
    smlal       v6.4s, v3.4h, v4.h[3]       // o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
    smull       v5.4s, v1.4h, v4.h[3]       // 36 * pi2_src[1]
    ld1         {v22.s}[0], [x2], x5        // prediction row 0 (interleaved with the math)
    smlsl       v5.4s, v3.4h, v4.h[1]       // o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]

    saddl       v7.4s, v0.4h, v2.4h         // pi2_src[0] + pi2_src[2]
    ssubl       v17.4s, v0.4h, v2.4h        // pi2_src[0] - pi2_src[2]
    shl         v7.4s, v7.4s, #6            // e[0] = 64 * (pi2_src[0] + pi2_src[2])
    shl         v17.4s, v17.4s, #6          // e[1] = 64 * (pi2_src[0] - pi2_src[2])

    add         v19.4s, v7.4s, v6.4s        // e[0] + o[0]
    add         v16.4s, v17.4s, v5.4s       // e[1] + o[1]
    sub         v18.4s, v17.4s, v5.4s       // e[1] - o[1]
    sub         v20.4s, v7.4s, v6.4s        // e[0] - o[0]

    // Rounding shift right with saturation to signed 16 bit.
    sqrshrn     v28.4h, v19.4s, #shift_stage1_idct // pi2_out[0] = clip_s16((e[0] + o[0] + add) >> shift)
    sqrshrn     v29.4h, v16.4s, #shift_stage1_idct // pi2_out[1] = clip_s16((e[1] + o[1] + add) >> shift)
    sqrshrn     v30.4h, v18.4s, #shift_stage1_idct // pi2_out[2] = clip_s16((e[1] - o[1] + add) >> shift)
    sqrshrn     v31.4h, v20.4s, #shift_stage1_idct // pi2_out[3] = clip_s16((e[0] - o[0] + add) >> shift)

    // 4x4 transpose of v28..v31 into v0..v3 (16-bit pairs, then 32-bit pairs).
    trn1        v24.4h, v28.4h, v29.4h
    trn2        v25.4h, v28.4h, v29.4h
    trn1        v26.4h, v30.4h, v31.4h
    trn2        v27.4h, v30.4h, v31.4h
    trn1        v0.2s, v24.2s, v26.2s
    trn2        v2.2s, v24.2s, v26.2s
    trn1        v1.2s, v25.2s, v27.2s
    trn2        v3.2s, v25.2s, v27.2s

    // ---- second stage: same butterfly on the transposed data in v0..v3 ------
    smull       v6.4s, v1.4h, v4.h[1]       // 83 * in[1]
    ld1         {v22.s}[1], [x2], x5        // prediction row 1
    smlal       v6.4s, v3.4h, v4.h[3]       // o[0] = 83 * in[1] + 36 * in[3]
    smull       v5.4s, v1.4h, v4.h[3]       // 36 * in[1]
    smlsl       v5.4s, v3.4h, v4.h[1]       // o[1] = 36 * in[1] - 83 * in[3]
    ld1         {v23.s}[0], [x2], x5        // prediction row 2

    saddl       v7.4s, v0.4h, v2.4h         // in[0] + in[2]
    ssubl       v17.4s, v0.4h, v2.4h        // in[0] - in[2]
    shl         v7.4s, v7.4s, #6            // e[0] = 64 * (in[0] + in[2])
    shl         v17.4s, v17.4s, #6          // e[1] = 64 * (in[0] - in[2])

    add         v19.4s, v7.4s, v6.4s        // e[0] + o[0]
    add         v16.4s, v17.4s, v5.4s       // e[1] + o[1]
    sub         v18.4s, v17.4s, v5.4s       // e[1] - o[1]
    sub         v20.4s, v7.4s, v6.4s        // e[0] - o[0]

    sqrshrn     v28.4h, v19.4s, #shift_stage2_idct // pi2_out[0] = clip_s16((e[0] + o[0] + add) >> shift)
    sqrshrn     v29.4h, v16.4s, #shift_stage2_idct // pi2_out[1] = clip_s16((e[1] + o[1] + add) >> shift)
    sqrshrn     v30.4h, v18.4s, #shift_stage2_idct // pi2_out[2] = clip_s16((e[1] - o[1] + add) >> shift)
    sqrshrn     v31.4h, v20.4s, #shift_stage2_idct // pi2_out[3] = clip_s16((e[0] - o[0] + add) >> shift)
    ld1         {v23.s}[1], [x2], x5        // prediction row 3

    // Transpose back so that v0..v3 hold output rows 0..3.
    trn1        v24.4h, v28.4h, v29.4h
    trn2        v25.4h, v28.4h, v29.4h
    trn1        v26.4h, v30.4h, v31.4h
    trn2        v27.4h, v30.4h, v31.4h
    trn1        v0.2s, v24.2s, v26.2s
    trn2        v2.2s, v24.2s, v26.2s
    trn1        v1.2s, v25.2s, v27.2s
    trn2        v3.2s, v25.2s, v27.2s

    // ---- reconstruction: residual + prediction, clipped to u8 ---------------
    mov         v0.d[1], v1.d[0]            // v0 = residual rows 0 and 1
    mov         v2.d[1], v3.d[0]            // v2 = residual rows 2 and 3

    uaddw       v0.8h, v0.8h, v22.8b        // pi2_out(16 bit) + pu1_pred(8 bit), rows 0-1
    uaddw       v2.8h, v2.8h, v23.8b        // pi2_out(16 bit) + pu1_pred(8 bit), rows 2-3
    sqxtun      v0.8b, v0.8h                // clip_u8(rows 0-1)
    sqxtun      v1.8b, v2.8h                // clip_u8(rows 2-3)

    // Store four rows of four bytes, advancing by dst_strd after each.
    st1         {v0.s}[0], [x3], x6
    st1         {v0.s}[1], [x3], x6
    st1         {v1.s}[0], [x3], x6
    st1         {v1.s}[1], [x3], x6

    ret
233
234
235
236
237
238