1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16; **********
17; *
18; * File Name:  omxVCM4P2_PredictReconCoefIntra_s.s
19; * OpenMAX DL: v1.0.2
20; * Revision:   12290
21; * Date:       Wednesday, April 9, 2008
22; *
23; *
24; *
25; *
26; * Description:
27; * Contains module for DC/AC coefficient prediction
28; *
29; *
30; * Function: omxVCM4P2_PredictReconCoefIntra
31; *
32; * Description:
33; * Performs adaptive DC/AC coefficient prediction for an intra block. Prior
34; * to the function call, prediction direction (predDir) should be selected
35; * as specified in subclause 7.4.3.1 of ISO/IEC 14496-2.
36; *
37; * Remarks:
38; *
39; * Parameters:
40; * [in]  pSrcDst      pointer to the coefficient buffer which contains the
41; *                    quantized coefficient residuals (PQF) of the current
42; *                    block; must be aligned on a 4-byte boundary. The
43; *                    output coefficients are saturated to the range
44; *                    [-2048, 2047].
45; * [in]  pPredBufRow  pointer to the coefficient row buffer; must be aligned
46; *                    on a 4-byte boundary.
47; * [in]  pPredBufCol  pointer to the coefficient column buffer; must be
48; *                    aligned on a 4-byte boundary.
49; * [in]  curQP        quantization parameter of the current block. curQP may
50; *                    be equal to predQP, especially when the current block
51; *                    and the predictor block are in the same macroblock.
52; * [in]  predQP       quantization parameter of the predictor block
53; * [in]  predDir      indicates the prediction direction which takes one
54; *                    of the following values:
55; *                    OMX_VIDEO_HORIZONTAL    predict horizontally
56; *                    OMX_VIDEO_VERTICAL        predict vertically
57; * [in]  ACPredFlag   a flag indicating if AC prediction should be
58; *                    performed. It is equal to ac_pred_flag in the bit
59; *                    stream syntax of MPEG-4
60; * [in]  videoComp    video component type (luminance, chrominance or
61; *                    alpha) of the current block
62; * [out] pSrcDst      pointer to the coefficient buffer which contains
63; *                    the quantized coefficients (QF) of the current
64; *                    block
65; * [out] pPredBufRow  pointer to the updated coefficient row buffer
66; * [out] pPredBufCol  pointer to the updated coefficient column buffer
67; * Return Value:
68; * OMX_Sts_NoErr - no error
69; * OMX_Sts_BadArgErr - Bad arguments
70; * - At least one of the pointers is NULL: pSrcDst, pPredBufRow, or pPredBufCol.
71; * - At least one of the following cases: curQP <= 0, predQP <= 0, curQP > 31,
72; *   predQP > 31, predDir exceeds [1,2].
73; * - At least one of the pointers pSrcDst, pPredBufRow, or pPredBufCol is not
74; *   4-byte aligned.
75; *
76; *********
77
78        INCLUDE omxtypes_s.h
79        INCLUDE armCOMM_s.h
80
81       M_VARIANTS CortexA8
82
83
84
85       IMPORT        armVCM4P2_Reciprocal_QP_S32
86       IMPORT        armVCM4P2_Reciprocal_QP_S16
87       IMPORT        armVCM4P2_DCScaler
88
89        IF CortexA8
90;// Input Arguments
;// Core-register aliases used by omxVCM4P2_PredictReconCoefIntra.
;// Several names map to the same physical register; an alias is only valid
;// while the value it names is live, so the instruction order in the function
;// body matters. The body uses r0-r3 directly and loads predQP, predDir,
;// ACPredFlag and videoComp from the stack into r4-r7.
91
92pSrcDst          RN 0     ;// base of the coefficient block; never written back, still valid at Exit
93pPredBufRow      RN 1     ;// row prediction buffer pointer
94pPredBufCol      RN 2     ;// column prediction buffer pointer (post-incremented in the Horizontal path)
95curQP            RN 3     ;// current QP; scaled by 4 in place before the reciprocal lookup
96QP               RN 3     ;// second name for curQP, used when indexing the DC scaler table
97predQP           RN 4     ;// loaded from the stack only after dcScaler (also r4) is dead
98predDir          RN 5     ;// compared against #2 to select the row (vertical) predictor
99ACPredFlag       RN 6     ;// loaded from the stack after index/temp1 (also r6) are dead
100videoComp        RN 7     ;// consumed by the DC scaler lookup, then r7 becomes predCoeffTable
101
102;// Local Variables
103
104shortVideoHeader RN 4     ;// not referenced in the visible code
105dcScaler         RN 4     ;// byte read from armVCM4P2_DCScaler
106index            RN 6     ;// address into armVCM4P2_DCScaler
107predCoeffTable   RN 7     ;// base of the reciprocal table (S16 for DC, S32 for AC)
108temp1            RN 6     ;// signed copy of the DC predictor; later a saved pSrcDst+64 pointer
109temp2            RN 9     ;// reciprocal word for AC; later the walking pointer in the Horizontal path
110temp             RN 14    ;// r14 (link register) used as scratch: holds the reconstructed DC until Exit
111Const            RN 8     ;// constant 4 used to scale curQP into a word offset
112temppPredColBuf  RN 8     ;// saved pPredBufCol base in the Horizontal path
113tempPred         RN 9     ;// DC prediction value (quotient, then sign-corrected)
114
115absCoeffDC       RN 8     ;// predictor DC, then its absolute value
116negdcScaler      RN 10    ;// -dcScaler, used by the MLA that forms the remainder
117Rem              RN 11    ;// remainder of the reciprocal-multiply division
118temp3            RN 12    ;// (dcScaler+1)>>1, the rounding threshold
119
120dcRowbufCoeff    RN 10    ;// halfword carried between prediction-buffer slots (reuses r10)
121dcColBuffCoeff   RN 11    ;// clipped DC * dcScaler, stored to pPredBufCol[0] (reuses r11)
122Return           RN 0     ;// status code returned in r0
123
124;//NEON Registers
;// NEON aliases with their lane types. Q registers overlap D registers
;// (Qn = D2n:D2n+1), so several names below refer to the same storage:
;//   Q1 (qCoeffTab)         = D2:D3   = dtemp4:dtemp5
;//   Q2 (qPredQP)           = D4:D5   = dPredQP0:dPredQP1 = dtemp6:dtemp7
;//   Q3 (qtemp/qtemp1)      = D6:D7   = dtemp0:dtemp1
;//   Q5 (qtempPred/...1)    = D10:D11 = dtempPred0:dtempPred1
;// The function only reuses a register under a new name once the old value is dead.
125
126qPredRowBuf       QN Q0.S16      ;// not referenced in the visible code (its halves are)
127dPredRowBuf0      DN D0.S16      ;// predictor coefficients 0..3
128dPredRowBuf1      DN D1.S16      ;// predictor coefficients 4..7
129
130
131
132
133qCoeffTab         QN Q1.S32      ;// reciprocal word replicated across four 32-bit lanes
134
135qPredQP           QN Q2.S16      ;// not referenced in the visible code (its halves are)
136dPredQP0          DN D4.S16      ;// predQP replicated across four 16-bit lanes
137dPredQP1          DN D5.S16      ;// scratch: holds the narrowed prediction for coefficients 0..3
138
139
140qtemp1            QN Q3.S32      ;// 32-bit products predictor*predQP
141qtemp             QN Q3.S16      ;// same register viewed as eight 16-bit coefficients
142
143dtemp0            DN D6.S16
144dtemp1            DN D7.S16
145
146dtemp2            DN D8.S16
147dtemp3            DN D9.S16
148
149dtemp4            DN D2.S16      ;// overlaps qCoeffTab (low half)
150dtemp5            DN D3.S16      ;// overlaps qCoeffTab (high half)
151dtemp6            DN D4.S16      ;// overlaps dPredQP0
152dtemp7            DN D5.S16      ;// overlaps dPredQP1
153
154qtempPred1        QN Q5.S32      ;// 32-bit scaled prediction before narrowing
155qtempPred         QN Q5.S16      ;// same register viewed as eight 16-bit predictions
156
157dtempPred0        DN D10.S16     ;// narrowed prediction for coefficients 0..3
158dtempPred1        DN D11.S16     ;// narrowed prediction for coefficients 4..7
159
160
161
;// omxVCM4P2_PredictReconCoefIntra (CortexA8 variant)
;//
;// Reconstructs the DC coefficient of an intra block from its predictor and,
;// when ACPredFlag == 1, also adds the scaled AC predictors to the first row
;// (predDir == 2) or the first column (any other predDir) of pSrcDst. The
;// row/column prediction buffers are updated along the way.
;//
;// In:   r0 = pSrcDst, r1 = pPredBufRow, r2 = pPredBufCol, r3 = curQP
;//       stack: predQP, predDir, ACPredFlag, videoComp (declared 4 bytes each)
;// Out:  r0 = OMX_Sts_NoErr (always; this routine performs no argument checks)
;//
;// M_START/M_ARG/M_LDR/M_END are macros from armCOMM_s.h; presumably M_START
;// saves the callee-saved registers up to r11/d11 plus lr and M_END restores
;// them and returns -- TODO confirm against armCOMM_s.h.
;// NOTE: 'temp' is r14. It carries the reconstructed DC value from the SSAT16
;// below down to the STRH at Exit, so no instruction in between may write r14.
162      M_START   omxVCM4P2_PredictReconCoefIntra,r11,d11
163
164      ;// Declare the four stack-passed arguments (4 bytes each)
165
166      M_ARG           predQPonStack,4
167      M_ARG           predDironStack,4
168      M_ARG           ACPredFlagonStack,4
169      M_ARG           videoComponStack,4
170
171      ;// DC Prediction
172
173      M_LDR           videoComp,videoComponStack                     ;// Load videoComp from the stack
174
175      M_LDR           predDir,predDironStack                         ;// Load prediction direction
176      ;// DC scaler lookup: dcScaler = byte at armVCM4P2_DCScaler + videoComp*32 + QP
177      LDR             index, =armVCM4P2_DCScaler
178      ADD             index,index,videoComp,LSL #5                   ;// 32 bytes per videoComp; last use of videoComp (r7)
179      LDRB            dcScaler,[index,QP]
180
181
182      LDR             predCoeffTable, =armVCM4P2_Reciprocal_QP_S16   ;// Table with entries 32767/(1 to 63); r7 reused now that videoComp is dead
183      CMP             predDir,#2                                     ;// Is the prediction direction vertical? Flags feed the two conditional loads below
184
185      ;// Calculate tempPred
186
187      LDREQSH         absCoeffDC,[pPredBufRow]                       ;// If vertical, load the signed DC predictor from the row buffer
188      LDRNESH         absCoeffDC,[pPredBufCol]                       ;// Otherwise (horizontal), load it from the column buffer
189
190      RSB             negdcScaler,dcScaler,#0                        ;// negdcScaler=-dcScaler
191      MOV             temp1,absCoeffDC                               ;// Keep the signed predictor in temp1 for the sign tests
192      CMP             temp1,#0
193      RSBLT           absCoeffDC,temp1,#0                            ;// absCoeffDC=abs(predictor)
194
195      ADD             temp,dcScaler,dcScaler                         ;// Byte offset 2*dcScaler into the halfword table
196      LDRH            temp,[predCoeffTable,temp]                     ;// Reciprocal of dcScaler, used to divide by multiplying
197      SMULBB          tempPred,temp,absCoeffDC                       ;// tempPred=abs(predictor)*reciprocal
198      ADD             temp3,dcScaler,#1
199      LSR             tempPred,tempPred,#15                          ;// tempPred=quotient estimate of abs(predictor)/dcScaler
200      LSR             temp3,temp3,#1                                 ;// temp3=(dcScaler+1)>>1, the rounding threshold
201      MLA             Rem,negdcScaler,tempPred,absCoeffDC            ;// Rem=abs(predictor)-tempPred*dcScaler; last use of negdcScaler (r10)
202
203      LDRH            dcRowbufCoeff,[pPredBufCol]                    ;// Read the old pPredBufCol[0] before it is overwritten below
204
205      CMP             Rem,temp3                                      ;// Compare the remainder with the rounding threshold
206      ADDGE           tempPred,#1                                    ;// tempPred=tempPred+1 if Rem>=(dcScaler+1)>>1
207      CMP             temp1,#0
208      RSBLT           tempPred,tempPred,#0                           ;// tempPred=-tempPred if the predictor was negative
209
210      STRH            dcRowbufCoeff,[pPredBufRow,#-16]               ;// pPredBufRow[-8]=old pPredBufCol[0] (-16 bytes = 8 halfwords back)
211
212
213      LDRH            temp,[pSrcDst]                                 ;// temp=pSrcDst[0]
214      ADD             temp,temp,tempPred                             ;// temp=pSrcDst[0]+tempPred
215      SSAT16          temp,#12,temp                                  ;// Clip each halfword of temp to [-2048,2047]; temp now stays live until Exit
216      SMULBB          dcColBuffCoeff,temp,dcScaler                   ;// dcColBuffCoeff=clipped DC*dcScaler; last use of dcScaler (r4)
217      M_LDR           ACPredFlag,ACPredFlagonStack                   ;// r6 is free: index/temp1 are dead here
218      STRH            dcColBuffCoeff,[pPredBufCol]                   ;// pPredBufCol[0]=low halfword of clipped DC*dcScaler
219
220
221       ;// AC Prediction
222
223      M_LDR           predQP,predQPonStack                           ;// r4 reused for predQP
224
225      CMP             ACPredFlag,#1                                  ;// Is the AC prediction flag set?
226      BNE             Exit                                           ;// If not, only the DC coefficient is written
227      CMP             predDir,#2                                     ;// Check the direction; flags stay untouched until the BNE Horizontal below
228      LDR             predCoeffTable, =armVCM4P2_Reciprocal_QP_S32   ;// Table with entries 0x1ffff/(1 to 63)
229      MOV             Const,#4
230      MUL             curQP,curQP,Const                              ;// curQP=4*curQP, a word offset into the table (MUL without S keeps the flags)
231      VDUP            dPredQP0,predQP                                ;// Replicate predQP into every 16-bit lane
232      LDR             temp2,[predCoeffTable,curQP]                   ;// temp2=0x1ffff/curQP
233      VDUP            qCoeffTab,temp2                                ;// Replicate the reciprocal into every 32-bit lane
234      BNE             Horizontal                                     ;// predDir != 2: take the Horizontal path
235
236
237
238      ;// Vertical
239      ;// Calculating tempPred for the eight row-buffer entries
240
241      VLD1            {dPredRowBuf0,dPredRowBuf1},[pPredBufRow]      ;// Load pPredBufRow[i]: i=0 to 7
242
243      VMULL           qtemp1,dPredRowBuf0,dPredQP0                   ;// qtemp1[i]=pPredBufRow[i]*predQP: i=0 to 3
244      VMUL            qtempPred1,qtemp1,qCoeffTab                    ;// qtempPred1[i]=pPredBufRow[i]*predQP*(0x1ffff/curQP): i=0 to 3
245
246      VMULL           qtemp1,dPredRowBuf1,dPredQP0                   ;// qtemp1[i]=pPredBufRow[i]*predQP: i=4 to 7
247
248      VRSHR           qtempPred1,qtempPred1,#17                      ;// qtempPred1[i]=round(pPredBufRow[i]*predQP/curQP): i=0 to 3
249      VSHRN           dPredQP1,qtempPred1,#0                         ;// Narrow to 16 bits into dPredQP1, because Q5 is about to be reused
250
251
252      VMUL            qtempPred1,qtemp1,qCoeffTab                    ;// qtempPred1[i]=pPredBufRow[i]*predQP*(0x1ffff/curQP): i=4 to 7
253      VRSHR           qtempPred1,qtempPred1,#17                      ;// qtempPred1[i]=round(pPredBufRow[i]*predQP/curQP): i=4 to 7
254      VLD1            {dtemp0,dtemp1},[pSrcDst]                      ;// Load pSrcDst[i]: i=0 to 7
255      VSHRN           dtempPred1,qtempPred1,#0                       ;// Narrow predictions 4..7 into the high half of qtempPred
256      VMOV            dtempPred0,dPredQP1                            ;// Move predictions 0..3 into the low half of qtempPred
257
258      ;// Updating source and row prediction buffer contents
259      VADD            qtemp,qtemp,qtempPred                          ;// qtemp[i]=pSrcDst[i]+qtempPred[i]: i=0 to 7
260      VQSHL           qtemp,qtemp,#4                                 ;// Saturating shift left by 4 ...
261      LDRH            dcRowbufCoeff,[pPredBufRow]                    ;// Read pPredBufRow[0] before the vector store overwrites it
262      VSHR            qtemp,qtemp,#4                                 ;// ... then shift right by 4: clips each lane to [-2048,2047]
263
264      VST1            {dtemp0,dtemp1},[pSrcDst]                      ;// Store the updated coefficients (pSrcDst[0] is rewritten at Exit)
265      VST1            {dtemp0,dtemp1},[pPredBufRow]                  ;// Store the same values as the new row prediction buffer
266      STRH            dcRowbufCoeff,[pPredBufRow]                    ;// Put back the pPredBufRow[0] value read above
267
268      B               Exit
269
270Horizontal
271
272      ;// Calculating tempPred for the eight column-buffer entries
273
274
275
276      VLD1            {dPredRowBuf0,dPredRowBuf1},[pPredBufCol]      ;// Load pPredBufCol[i]: i=0 to 7
277      VMULL           qtemp1,dPredRowBuf0,dPredQP0                   ;// qtemp1[i]=pPredBufCol[i]*predQP: i=0 to 3
278      VMUL            qtempPred1,qtemp1,qCoeffTab                    ;// qtempPred1[i]=pPredBufCol[i]*predQP*(0x1ffff/curQP): i=0 to 3
279
280      VMULL           qtemp1,dPredRowBuf1,dPredQP0                   ;// qtemp1[i]=pPredBufCol[i]*predQP: i=4 to 7
281
282      VRSHR           qtempPred1,qtempPred1,#17                      ;// qtempPred1[i]=round(pPredBufCol[i]*predQP/curQP): i=0 to 3
283      VSHRN           dPredQP1,qtempPred1,#0                         ;// Narrow to 16 bits into dPredQP1, because Q5 is about to be reused
284
285
286      VMUL            qtempPred1,qtemp1,qCoeffTab                    ;// qtempPred1[i]=pPredBufCol[i]*predQP*(0x1ffff/curQP): i=4 to 7
287
288      MOV             temppPredColBuf,pPredBufCol                    ;// Save the buffer base: pPredBufCol is post-incremented below
289      VRSHR           qtempPred1,qtempPred1,#17                      ;// qtempPred1[i]=round(pPredBufCol[i]*predQP/curQP): i=4 to 7
290      VLD4            {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst]        ;// Load halfwords 0..15 de-interleaved by 4: dtemp0 = elements 0,4,8,12
291      VSHRN           dtempPred1,qtempPred1,#0                       ;// Narrow predictions 4..7 to 16 bits
292      VMOV            dtempPred0,dPredQP1                            ;// Move predictions 0..3 into dtempPred0
293
294      ;// Updating source and column prediction buffer contents
295      ADD             temp2,pSrcDst,#32                              ;// temp2 -> halfword 16 of the block
296      VLD4            {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]          ;// Load halfwords 16..31 de-interleaved by 4 (overwrites qCoeffTab/dPredQP, both dead)
297      VUZP            dtemp0,dtemp4                                  ;// Unzip: dtemp0 = elements 0,8,16,24 (column 0 if rows are 8 halfwords)
298      VADD            dtemp0,dtemp0,dtempPred0                       ;// Add predictions 0..3
299      VQSHL           dtemp0,dtemp0,#4                               ;// Saturating shift left by 4 ...
300      VSHR            dtemp0,dtemp0,#4                               ;// ... then shift right by 4: clips each lane to [-2048,2047]
301      VST1            {dtemp0},[pPredBufCol]!                        ;// Update column buffer entries 0..3; pPredBufCol advances
302      VZIP            dtemp0,dtemp4                                  ;// Zip back to the VLD4 layout so VST4 can re-interleave
303      VST4            {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst]        ;// Store halfwords 0..15 back to the block
304      VST4            {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]!         ;// Store halfwords 16..31; temp2 advances to halfword 32
305
306      MOV             temp1,temp2                                    ;// Remember the address of halfword 32 for the store below
307      VLD4            {dtemp0,dtemp1,dtemp2,dtemp3},[temp2]!         ;// Load halfwords 32..47 de-interleaved by 4; temp2 advances to halfword 48
308
309      VLD4            {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]          ;// Load halfwords 48..63 de-interleaved by 4
310      VUZP            dtemp0,dtemp4                                  ;// Unzip: dtemp0 = elements 32,40,48,56
311      VADD            dtemp0,dtemp0,dtempPred1                       ;// Add predictions 4..7
312      VQSHL           dtemp0,dtemp0,#4                               ;// Clip to [-2048,2047] (with the VSHR below)
313      VSHR            dtemp0,dtemp0,#4
314      VST1            {dtemp0},[pPredBufCol]!                        ;// Update column buffer entries 4..7
315      VZIP            dtemp0,dtemp4                                  ;// Zip back to the VLD4 layout
316      VST4            {dtemp0,dtemp1,dtemp2,dtemp3},[temp1]          ;// Store halfwords 32..47 back to the block
317      STRH            dcColBuffCoeff,[temppPredColBuf]               ;// Restore pPredBufCol[0]=clipped DC*dcScaler, overwritten by the first VST1
318      VST4            {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]          ;// Store halfwords 48..63 back to the block
319
320Exit
321
322      STRH            temp,[pSrcDst]                                 ;// pSrcDst[0]=reconstructed, clipped DC (held in r14 since the SSAT16)
323
324
325      MOV             Return,#OMX_Sts_NoErr
326
327      M_END
328      ENDIF
329
330
331       END
332
333
334
335