1;//
2;// Copyright (C) 2007-2008 ARM Limited
3;//
4;// Licensed under the Apache License, Version 2.0 (the "License");
5;// you may not use this file except in compliance with the License.
6;// You may obtain a copy of the License at
7;//
8;//      http://www.apache.org/licenses/LICENSE-2.0
9;//
10;// Unless required by applicable law or agreed to in writing, software
11;// distributed under the License is distributed on an "AS IS" BASIS,
12;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13;// See the License for the specific language governing permissions and
14;// limitations under the License.
15;//
16;//
17;//
18;// File Name:  armVCM4P10_Interpolate_Chroma_s.s
19;// OpenMAX DL: v1.0.2
20;// Revision:   9641
21;// Date:       Thursday, February 7, 2008
22;//
23;//
24;//
25;//
26
27
28        INCLUDE omxtypes_s.h
29        INCLUDE armCOMM_s.h
30
31        M_VARIANTS CortexA8
32
33
34    IF CortexA8
35
36    M_TABLE armVCM4P10_WidthBranchTableMVIsNotZero
    ;// Branch targets for the interpolating paths (selected when dx + dy != 0).
    ;// The dispatch is "LDR pc, [pTable, iWidth, LSL #1]", i.e. it fetches the
    ;// word at byte offset iWidth*2, so iWidth = 2, 4, 8 select words 1, 2, 4.
    ;// Words 0 and 3 repeat a neighbouring entry; presumably they only pad the
    ;// table so the used slots line up (they would be hit by iWidth = 0 or 6).
    ;// M_TABLE itself is a macro from armCOMM_s.h and is not visible here.
37
38    DCD   WidthIs2MVIsNotZero, WidthIs2MVIsNotZero    ;// words 0, 1 (word 1 <=> iWidth = 2)
39    DCD   WidthIs4MVIsNotZero, WidthIs4MVIsNotZero    ;// words 2, 3 (word 2 <=> iWidth = 4)
40    DCD   WidthIs8MVIsNotZero                         ;// word 4     (iWidth = 8)
41
42    M_TABLE armVCM4P10_WidthBranchTableMVIsZero
    ;// Branch targets for the plain-copy paths (selected when dx + dy == 0).
    ;// Indexed exactly like the table for the non-zero case: the dispatch reads
    ;// the word at byte offset iWidth*2, so iWidth = 2, 4, 8 select words 1, 2, 4.
    ;// Words 0 and 3 repeat a neighbouring entry; presumably padding only.
43
44    DCD   WidthIs2MVIsZero, WidthIs2MVIsZero          ;// words 0, 1 (word 1 <=> iWidth = 2)
45    DCD   WidthIs4MVIsZero, WidthIs4MVIsZero          ;// words 2, 3 (word 2 <=> iWidth = 4)
46    DCD   WidthIs8MVIsZero                            ;// word 4     (iWidth = 8)
47
48
49;// input registers
;// r0-r3 are used as received; r4-r7 are filled from the stack arguments
;// Width, Height, Dx and Dy at the start of the function.
50
51pSrc                 RN 0     ;// source address; advanced by the post-indexed loads
52iSrcStep             RN 1     ;// byte distance between consecutive source rows
53pDst                 RN 2     ;// destination address; advanced by the post-indexed stores
54iDstStep             RN 3     ;// byte distance between consecutive destination rows
55iWidth               RN 4     ;// block width; selects the branch-table entry
56iHeight              RN 5     ;// rows still to produce; counted down by the loops
57dx                   RN 6     ;// horizontal fractional offset
58dy                   RN 7     ;// vertical fractional offset
59
60;// local variable registers
61pc                   RN 15    ;// program counter, written directly by the table dispatch
62return               RN 0     ;// status result; shares r0 with pSrc, set just before each exit
63EightMinusdx         RN 8     ;// 8 - dx
64EightMinusdy         RN 9     ;// 8 - dy
65
;// Interpolation weights. BCoeff, CCoeff and DCoeff reuse the registers of
;// EightMinusdy, EightMinusdx and dx, so the order of the multiplies matters.
66ACoeff               RN 12    ;// (8 - dx) * (8 - dy)
67BCoeff               RN 9     ;// dx * (8 - dy)        (overwrites EightMinusdy)
68CCoeff               RN 8     ;// (8 - dx) * dy        (overwrites EightMinusdx)
69DCoeff               RN 6     ;// dx * dy              (overwrites dx)
70
71pTable               RN 11    ;// address of the selected branch table
72
73Step1                RN 10    ;// constant 1, used as a post-index increment
74SrcStepMinus1        RN 14    ;// iSrcStep - 1: completes the move to the next row after a +1 step
75
76dACoeff              DN D12.U8        ;// weight A replicated into every byte lane
77dBCoeff              DN D13.U8        ;// weight B replicated into every byte lane
78dCCoeff              DN D14.U8        ;// weight C replicated into every byte lane
79dDCoeff              DN D15.U8        ;// weight D replicated into every byte lane
80
;// Source rows. The "a" register holds 8 pixels starting at column x, the
;// "b" register holds 8 pixels starting at column x + 1 of the same row.
81dRow0a               DN D0.U8
82dRow0b               DN D1.U8
83dRow1a               DN D2.U8
84dRow1b               DN D3.U8
85
;// 16-bit accumulators for output rows 0 and 1.
86qRow0a               QN Q2.S16
87qRow0b               QN Q3.S16
88
89;//dIndex               DN    D16.U8
;// 16-bit accumulators for output rows 2 and 3 (used by the width-8 loop only).
90qRow1a               QN Q11.S16
91qRow1b               QN Q12.S16
92
93dRow2a               DN D16.U8
94dRow2b               DN D17.U8
95dRow3a               DN D18.U8
96dRow3b               DN D19.U8
97
;// qOutRow2/3 name the same registers as qRow1a/qRow1b (Q11, Q12), typed as
;// U16 for the narrowing step; dOutRow2/3 receive the 8-bit results.
98qOutRow2             QN Q11.U16
99qOutRow3             QN Q12.U16
100dOutRow2             DN D20.U8
101dOutRow3             DN D21.U8
102dOutRow2U64          DN D20.U64      ;// D20 viewed as one 64-bit element for the store
103dOutRow3U64          DN D21.U64      ;// D21 viewed as one 64-bit element for the store
104
;// qOutRow0/1 name the same registers as qRow0a/qRow0b (Q2, Q3), typed as
;// U16 for the narrowing step; dOutRow0/1 receive the 8-bit results.
105qOutRow0             QN Q2.U16
106qOutRow1             QN Q3.U16
107dOutRow0             DN D8.U8
108dOutRow1             DN D9.U8
109
;// Alternative element-size views of D8/D9, used to store 8, 4 or 2 bytes.
110dOutRow0U64          DN D8.U64
111dOutRow1U64          DN D9.U64
112
113dOutRow0U32          DN D8.U32
114dOutRow1U32          DN D9.U32
115
116dOutRow0U16          DN D8.U16
117dOutRow1U16          DN D9.U16
118
119
;// Views of D0/D1 (the same registers as dRow0a/dRow0b) used by the copy
;// paths, which store the loaded source rows directly.
120dOut0U64             DN D0.U64
121dOut1U64             DN D1.U64
122
;// dOut10U32 and dOut11U32 (D2, D3) are not referenced by the code visible here.
123dOut00U32            DN D0.U32
124dOut01U32            DN D1.U32
125dOut10U32            DN D2.U32
126dOut11U32            DN D3.U32
127
128dOut0U16             DN D0.U16
129dOut1U16             DN D1.U16
130
131;//-----------------------------------------------------------------------------------------------
132;// armVCM4P10_Interpolate_Chroma_asm starts
133;//-----------------------------------------------------------------------------------------------
134
135        ;// Write function header
        ;//
        ;// armVCM4P10_Interpolate_Chroma
        ;//
        ;// Produces an iWidth x iHeight block at pDst from the source at pSrc.
        ;// When dx + dy == 0 the source rows are copied unchanged. Otherwise
        ;// every output pixel is the weighted sum of a 2x2 source neighbourhood,
        ;//     out = (A*x00 + B*x01 + C*x10 + D*x11) narrowed with a rounding,
        ;//           saturating right shift by 6,
        ;// with A = (8-dx)*(8-dy), B = dx*(8-dy), C = (8-dx)*dy, D = dx*dy
        ;// (the four weights always add up to 64).
        ;//
        ;// In:    r0 = pSrc, r1 = iSrcStep, r2 = pDst, r3 = iDstStep
        ;//        stack: Width, Height, Dx, Dy (4 bytes each)
        ;// Out:   r0 = OMX_Sts_NoErr on every exit path
        ;//
        ;// The branch tables only provide distinct targets for iWidth = 2, 4, 8;
        ;// no argument checking is done here.
        ;// M_START, M_ARG, M_LDRD, M_EXIT and M_END are macros from armCOMM_s.h
        ;// (not visible here). The "r11, d15" operands presumably ask M_START to
        ;// preserve registers up to r11 and d15 - TODO confirm against the macro.
136        M_START armVCM4P10_Interpolate_Chroma, r11, d15
137
138        ;// Define stack arguments
139        M_ARG   Width,      4
140        M_ARG   Height,     4
141        M_ARG   Dx,         4
142        M_ARG   Dy,         4
143
144        ;// Load argument from the stack
145        ;// M_STALL ARM1136JS=4
146
147        M_LDRD   dx, dy, Dx                     ;// presumably loads the Dx/Dy pair into dx, dy
148        M_LDRD   iWidth, iHeight, Width         ;// presumably loads the Width/Height pair
149
150        ;// EightMinusdx = 8 - dx
151        ;// EightMinusdy = 8 - dy
152
153        ;// ACoeff = EightMinusdx * EightMinusdy
154        ;// BCoeff = dx * EightMinusdy
155        ;// CCoeff = EightMinusdx * dy
156        ;// DCoeff = dx * dy
157
158        RSB     EightMinusdx, dx, #8
159        RSB     EightMinusdy, dy, #8
160        CMN     dx,dy                           ;// flags from dx + dy; Z set when the sum is 0
161        MOV     Step1, #1                       ;// no S suffix: the CMN flags stay intact
162        LDREQ   pTable, =armVCM4P10_WidthBranchTableMVIsZero
163        SUB     SrcStepMinus1, iSrcStep, Step1  ;// no S suffix: the CMN flags stay intact
164        LDRNE   pTable, =armVCM4P10_WidthBranchTableMVIsNotZero
165
        ;// Row 0 is loaded here for every path. After the two loads pSrc has
        ;// advanced by 1 + (iSrcStep - 1) = iSrcStep, i.e. it addresses row 1.
166        VLD1    dRow0a, [pSrc], Step1                   ;// 0a: row 0, 8 bytes from column x
167
        ;// BCoeff, CCoeff and DCoeff overwrite EightMinusdy, EightMinusdx and dx
        ;// respectively, so each multiply below must read its inputs before a
        ;// later one replaces them; the order is significant.
168        SMULBB  ACoeff, EightMinusdx, EightMinusdy
169        SMULBB  BCoeff, dx, EightMinusdy        ;// last use of EightMinusdy
170        VLD1    dRow0b, [pSrc], SrcStepMinus1           ;// 0b: row 0, 8 bytes from column x + 1
171        SMULBB  CCoeff, EightMinusdx, dy        ;// last use of EightMinusdx
172        SMULBB  DCoeff, dx, dy                  ;// last use of dx
173
        ;// Replicate each weight into all byte lanes of its D register.
174        VDUP    dACoeff, ACoeff
175        VDUP    dBCoeff, BCoeff
176        VDUP    dCCoeff, CCoeff
177        VDUP    dDCoeff, DCoeff
178
179        LDR     pc, [pTable, iWidth, LSL #1]      ;// Branch to the case based on iWidth (table byte offset = iWidth * 2)
180
181;// Pixel layout:
182;//
183;//   x00 x01 x02
184;//   x10 x11 x12
185;//   x20 x21 x22
186
187;// If fractional mv is not (0, 0)
;//
;// Width 8: four output rows per iteration. On entry (and at the top of every
;// iteration) dRow0a/dRow0b hold row 0 at columns x / x+1 and pSrc addresses
;// row 1. "Na"/"Nb" below mean row N at column x / x+1, and
;//     outN = A*Na + B*Nb + C*(N+1)a + D*(N+1)b
;// Loads and multiplies are interleaved; the accumulators are
;// out0 = qRow0a, out1 = qRow0b, out2 = qRow1a, out3 = qRow1b.
;// NOTE(review): iHeight is decremented by 4 per pass, so this assumes iHeight
;// is a multiple of 4 - confirm the callers guarantee that.
188WidthIs8MVIsNotZero
189
190                VLD1   dRow1a, [pSrc], Step1            ;// 1a
191                VMULL  qRow0a, dRow0a, dACoeff          ;// out0  = A*0a
192                VLD1   dRow1b, [pSrc], SrcStepMinus1    ;// 1b
193                VMULL  qRow0b, dRow1a, dACoeff          ;// out1  = A*1a
194                VLD1   dRow2a, [pSrc], Step1            ;// 2a
195                VMLAL  qRow0a, dRow0b, dBCoeff          ;// out0 += B*0b
196                VLD1   dRow2b, [pSrc], SrcStepMinus1    ;// 2b
197                VMULL  qRow1a, dRow2a, dACoeff          ;// out2  = A*2a
198                VMLAL  qRow0b, dRow1b, dBCoeff          ;// out1 += B*1b
199                VLD1   dRow3a, [pSrc], Step1            ;// 3a
200                VMLAL  qRow0a, dRow1a, dCCoeff          ;// out0 += C*1a
201                VMLAL  qRow1a, dRow2b, dBCoeff          ;// out2 += B*2b
202                VMULL  qRow1b, dRow3a, dACoeff          ;// out3  = A*3a
203                VLD1   dRow3b, [pSrc], SrcStepMinus1    ;// 3b
204                VMLAL  qRow0b, dRow2a, dCCoeff          ;// out1 += C*2a
205                VLD1   dRow0a, [pSrc], Step1            ;// 0a: row 4, kept as row 0 of the next iteration
206                VMLAL  qRow1b, dRow3b, dBCoeff          ;// out3 += B*3b
207                VMLAL  qRow1a, dRow3a, dCCoeff          ;// out2 += C*3a
208                VMLAL  qRow0a, dRow1b, dDCoeff          ;// out0 += D*1b (out0 complete)
209                VLD1   dRow0b, [pSrc], SrcStepMinus1    ;// 0b: row 4 at column x + 1
210                VMLAL  qRow1b, dRow0a, dCCoeff          ;// out3 += C*4a
211                VMLAL  qRow0b, dRow2b, dDCoeff          ;// out1 += D*2b (out1 complete)
212                VMLAL  qRow1a, dRow3b, dDCoeff          ;// out2 += D*3b (out2 complete)
213
214
215                SUBS   iHeight, iHeight, #4             ;// flags are consumed by the BGT at the end of the loop
216                VMLAL  qRow1b, dRow0b, dDCoeff          ;// out3 += D*4b (out3 complete)
217
                ;// qOutRowN are the accumulator registers viewed as U16. Narrow
                ;// each sum to 8 bits with a rounding, saturating shift right by 6.
218                VQRSHRN dOutRow0, qOutRow0, #6
219                VQRSHRN dOutRow1, qOutRow1, #6
220                VQRSHRN dOutRow2, qOutRow2, #6
221                VST1   dOutRow0U64, [pDst], iDstStep    ;// 8 bytes per row, then pDst += iDstStep
222                VQRSHRN dOutRow3, qOutRow3, #6
223
224                VST1   dOutRow1U64, [pDst], iDstStep
225                VST1   dOutRow2U64, [pDst], iDstStep
226                VST1   dOutRow3U64, [pDst], iDstStep
227
228
229                BGT     WidthIs8MVIsNotZero             ;// repeat while rows remain (iHeight > 0 after the SUBS)
230                MOV     return,  #OMX_Sts_NoErr
231                M_EXIT
232
233WidthIs4MVIsNotZero
                ;// Width 4, fractional offset not (0, 0): two output rows per
                ;// iteration. On entry dRow0a/dRow0b hold row 0 at columns x / x+1
                ;// and pSrc addresses row 1. With "Na"/"Nb" = row N at column
                ;// x / x+1:  outN = A*Na + B*Nb + C*(N+1)a + D*(N+1)b,
                ;// out0 in qRow0a and out1 in qRow0b.
                ;// Each load still fetches a full 8-byte D register although only
                ;// the low 4 result bytes are stored.
234
235                VLD1   dRow1a, [pSrc], Step1            ;// 1a
236                VMULL  qRow0a, dRow0a, dACoeff          ;// out0  = A*0a
237                VMULL  qRow0b, dRow1a, dACoeff          ;// out1  = A*1a
238                VLD1   dRow1b, [pSrc], SrcStepMinus1    ;// 1b; pSrc now addresses row 2
239                VMLAL  qRow0a, dRow0b, dBCoeff          ;// out0 += B*0b
240                VMLAL  qRow0b, dRow1b, dBCoeff          ;// out1 += B*1b
241                VLD1   dRow0a, [pSrc], Step1            ;// 2a, kept as row 0 of the next iteration
242                VMLAL  qRow0a, dRow1a, dCCoeff          ;// out0 += C*1a
243                VMLAL  qRow0b, dRow0a, dCCoeff          ;// out1 += C*2a
244                VLD1   dRow0b, [pSrc], SrcStepMinus1    ;// 2b
245                SUBS   iHeight, iHeight, #2             ;// flags are consumed by the BGT below
246                VMLAL  qRow0b, dRow0b, dDCoeff          ;// out1 += D*2b
247                VMLAL  qRow0a, dRow1b, dDCoeff          ;// out0 += D*1b
248
                ;// qOutRow0/1 are qRow0a/qRow0b viewed as U16; narrow with a
                ;// rounding, saturating shift right by 6.
249                VQRSHRN dOutRow1, qOutRow1, #6
250                VQRSHRN dOutRow0, qOutRow0, #6
251
252                VST1   dOutRow0U32[0], [pDst], iDstStep ;// store lane 0 only: the 4 leftmost pixels
253                VST1   dOutRow1U32[0], [pDst], iDstStep
254
255                BGT     WidthIs4MVIsNotZero             ;// repeat while rows remain
256                MOV     return,  #OMX_Sts_NoErr
257                M_EXIT
258
259WidthIs2MVIsNotZero
                ;// Width 2, fractional offset not (0, 0): same arithmetic as the
                ;// width-4 loop (two output rows per iteration, out0 in qRow0a and
                ;// out1 in qRow0b); only the stores differ, writing the low 16-bit
                ;// lane (2 pixels) of each result row.
                ;// With "Na"/"Nb" = row N at column x / x+1:
                ;//     outN = A*Na + B*Nb + C*(N+1)a + D*(N+1)b
260
261                VLD1   dRow1a, [pSrc], Step1            ;// 1a
262                VMULL  qRow0a, dRow0a, dACoeff          ;// out0  = A*0a
263                VMULL  qRow0b, dRow1a, dACoeff          ;// out1  = A*1a
264                VLD1   dRow1b, [pSrc], SrcStepMinus1    ;// 1b; pSrc now addresses row 2
265                VMLAL  qRow0a, dRow0b, dBCoeff          ;// out0 += B*0b
266                VMLAL  qRow0b, dRow1b, dBCoeff          ;// out1 += B*1b
267                VLD1   dRow0a, [pSrc], Step1            ;// 2a, kept as row 0 of the next iteration
268                VMLAL  qRow0a, dRow1a, dCCoeff          ;// out0 += C*1a
269                VMLAL  qRow0b, dRow0a, dCCoeff          ;// out1 += C*2a
270                VLD1   dRow0b, [pSrc], SrcStepMinus1    ;// 2b
271                SUBS   iHeight, iHeight, #2             ;// flags are consumed by the BGT below
272                VMLAL  qRow0b, dRow0b, dDCoeff          ;// out1 += D*2b
273                VMLAL  qRow0a, dRow1b, dDCoeff          ;// out0 += D*1b
274
                ;// Narrow with a rounding, saturating shift right by 6.
275                VQRSHRN dOutRow1, qOutRow1, #6
276                VQRSHRN dOutRow0, qOutRow0, #6
277
278                VST1   dOutRow0U16[0], [pDst], iDstStep ;// store lane 0 only: the 2 leftmost pixels
279                VST1   dOutRow1U16[0], [pDst], iDstStep
280
281                BGT     WidthIs2MVIsNotZero             ;// repeat while rows remain
282                MOV     return,  #OMX_Sts_NoErr
283                M_EXIT
284
285;// If fractional mv is (0, 0)
;//
;// Width 8 copy: the code before the dispatch has already advanced pSrc by one
;// row while loading row 0, so step back to row 0 first; the loop then copies
;// two 8-byte rows per iteration.
286WidthIs8MVIsZero
287                SUB     pSrc, pSrc, iSrcStep            ;// rewind to row 0
288
289WidthIs8LoopMVIsZero
290                VLD1    dRow0a, [pSrc], iSrcStep        ;// even row -> D0
291                SUBS    iHeight, iHeight, #2            ;// flags are consumed by the BGT below
292                VLD1    dRow0b, [pSrc], iSrcStep        ;// odd row  -> D1
293                VST1    dOut0U64, [pDst], iDstStep      ;// dOut0U64 is D0, the register just loaded as dRow0a
294                VST1    dOut1U64, [pDst], iDstStep      ;// dOut1U64 is D1, the register just loaded as dRow0b
295                BGT     WidthIs8LoopMVIsZero            ;// repeat while rows remain
296
297                MOV     return,  #OMX_Sts_NoErr
298                M_EXIT
299
300WidthIs4MVIsZero
                ;// Width 4 copy, two rows per iteration. On entry D0 (dRow0a /
                ;// dOut00U32) already holds row 0 and pSrc addresses row 1, so the
                ;// loop alternates "load next row, store previous row" between D1
                ;// and D0.
                ;// NOTE(review): the second load of the final iteration fetches a
                ;// row that is never stored, so one row beyond iHeight is read.
301                VLD1    dRow0b, [pSrc], iSrcStep        ;// odd row -> D1
302
303                SUBS    iHeight, iHeight, #2            ;// flags are consumed by the BGT below
304
305                VST1    dOut00U32[0], [pDst], iDstStep  ;// low 4 bytes of D0: even row
306                VLD1    dRow0a, [pSrc], iSrcStep        ;// next even row -> D0
307                VST1    dOut01U32[0], [pDst], iDstStep  ;// low 4 bytes of D1: odd row
308
309                BGT     WidthIs4MVIsZero                ;// repeat while rows remain
310                MOV     return,  #OMX_Sts_NoErr
311                M_EXIT
312
313WidthIs2MVIsZero
                ;// Width 2 copy, two rows per iteration. On entry D0 (dRow0a /
                ;// dOut0U16) already holds row 0 and pSrc addresses row 1; the loop
                ;// alternates "load next row, store previous row" between D1 and D0
                ;// and writes the low 16-bit lane (2 pixels) of each.
                ;// NOTE(review): the second load of the final iteration fetches a
                ;// row that is never stored, so one row beyond iHeight is read.
314                VLD1    dRow0b, [pSrc], iSrcStep        ;// odd row -> D1
315                SUBS    iHeight, iHeight, #2            ;// flags are consumed by the BGT below
316
317                VST1    dOut0U16[0], [pDst], iDstStep   ;// low 2 bytes of D0: even row
318                VLD1    dRow0a, [pSrc], iSrcStep        ;// next even row -> D0
319                VST1    dOut1U16[0], [pDst], iDstStep   ;// low 2 bytes of D1: odd row
320
321                BGT     WidthIs2MVIsZero                ;// repeat while rows remain
322                MOV     return,  #OMX_Sts_NoErr
323                M_END
324
325        ENDIF ;// CortexA8
326
327        END
328
329;//-----------------------------------------------------------------------------------------------
330;// armVCM4P10_Interpolate_Chroma_asm ends
331;//-----------------------------------------------------------------------------------------------
332
333