@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@*******************************************************************************
@* @file
@*  ih264_resi_trans_quant_a9.s
@*
@* @brief
@*  Contains function definitions for residue calculation, forward transform and quantization
@*
@* @author
@*  Ittiam
@*
@* @par List of Functions:
@*  ih264_resi_trans_quant_4x4_a9
@*  ih264_resi_trans_quant_8x8_a9
@*  ih264_resi_trans_quant_chroma_4x4_a9
@*  ih264_hadamard_quant_4x4_a9
@*  ih264_hadamard_quant_2x2_uv_a9
@*
@* @remarks
@*  None
@*
@*******************************************************************************


.text
.p2align 2
@*****************************************************************************
@*
@* Function Name     : ih264_resi_trans_quant_4x4_a9
@* Description       : This function does residue calculation, forward transform
@*                     and quantization for a 4x4 luma block
@*
@* Arguments         :  R0 :pointer to src buffer
@                       R1 :pointer to pred buffer
@                       R2 :pointer to dst buffer
@                       R3 :source stride
@                       STACK : pred stride,
@                               dst stride,
@                               pointer to scaling matrix,
@                               pointer to threshold matrix,
@                               qbits,
@                               rounding factor,
@                               pointer to store nnz
@                               pointer to store non quantized dc value
@ Values Returned   : NONE
@ Register Usage    :
@ Stack Usage       : 40 bytes
@ Cycles            : Around
@ Interruptibility  : Interruptible
@
@ Known Limitations
@   Assumptions     :
@
@ Revision History  :
@         DD MM YYYY    Author(s)   Changes
@         1 12 2013    100633      First version
@         20 1 2014    100633      Changed the API, optimization
@
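@ Operation (reference outline of what the NEON code below implements):
@   1. residue  : diff[i][j] = src[i][j] - pred[i][j] for the 4x4 block
@   2. transform: forward 4x4 core transform, applied along the rows first,
@                 then (after a transpose) along the columns
@   3. quant    : level = sign(w) * ((abs(w) * pu2_scale_matrix[i] +
@                 u4_round_factor) >> u4_qbits); the unquantized DC term is
@                 also written out through the dc address argument
@   4. nnz      : the count of non-zero quantized coefficients is written
@                 through the nnz pointer
@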
@*****************************************************************************

    .global ih264_resi_trans_quant_4x4_a9
ih264_resi_trans_quant_4x4_a9:

    @R0     :pointer to src buffer
    @R1     :pointer to pred buffer
    @R2     :pointer to dst buffer
    @R3     :Source stride
    @STACK  :pred stride
    @       :scale matrix
    @       :threshold matrix
    @       :qbits
    @       :round factor
    @       :nnz
    @       :pointer to store unquantized dc value

    push          {r4-r12, lr}          @push all the variables first

    add           r11, sp, #40          @point r11 at the arguments passed on the stack (above the 40 bytes of saved registers)
    ldmfd         r11, {r4-r10}         @load the stack arguments into registers

    @R0     :pointer to src buffer
    @R1     :pointer to pred buffer
    @R2     :pointer to dst buffer
    @R3     :Source stride
    @R4     :Pred stride
    @R5     :scale matrix
    @R6     :threshold matrix
    @R7     :qbits
    @R8     :round factor
    @R9     :nnz
    @R10    :pointer to store unquantized dc value

    vpush         {d8-d15}

    mov           r11, #0
    sub           r7, r11, r7           @negate the qbits value for use with LSL

    @------------Function loading done----------------;
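    @note: the quantizer shift is applied later with VSHL using this negated
    @qbits value; a negative per-lane shift count makes VSHL shift right, which
    @gives ">> u4_qbits" since VSHR only accepts an immediate shift amount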

    vld1.u8       d30, [r0], r3         @load first 8 pix src row 1

    vld1.u8       d31, [r1], r4         @load first 8 pix pred row 1

    vld1.u8       d28, [r0], r3         @load first 8 pix src row 2

    vld1.u8       d29, [r1], r4         @load first 8 pix pred row 2

    vld1.u8       d26, [r0], r3         @load first 8 pix src row 3

    vld1.u8       d27, [r1], r4         @load first 8 pix pred row 3
    vsubl.u8      q0, d30, d31          @find residue row 1

    vld1.u8       d24, [r0], r3         @load first 8 pix src row 4

    vld1.u8       d25, [r1], r4         @load first 8 pix pred row 4
    vsubl.u8      q1, d28, d29          @find residue row 2

    vsubl.u8      q2, d26, d27          @find residue row 3
    vsubl.u8      q3, d24, d25          @find residue row 4

    vtrn.16       d0, d2                @T12
    vtrn.16       d4, d6                @T23
    vtrn.32       d0, d4                @T13
    vtrn.32       d2, d6                @T14
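    @after this transpose d0, d2, d4 and d6 hold columns 0..3 of the residual
    @block, so the butterflies below perform the row (horizontal) transform on
    @all four rows in parallel, one row per 16-bit lane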

    vadd.s16      d8 , d0, d6           @x0 = x4+x7
    vadd.s16      d9 , d2, d4           @x1 = x5+x6
    vsub.s16      d10, d2, d4           @x2 = x5-x6
    vsub.s16      d11, d0, d6           @x3 = x4-x7

    vshl.s16      d12, d10, #1          @U_SHIFT(x2,1,shft)
    vshl.s16      d13, d11, #1          @U_SHIFT(x3,1,shft)

    vadd.s16      d14, d8, d9           @x4 = x0 + x1;
    vsub.s16      d16, d8, d9           @x6 = x0 - x1;
    vadd.s16      d15, d13, d10         @x5 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16      d17, d11, d12         @x7 = x3 - U_SHIFT(x2,1,shft);

    @take the transpose again so we can do the vertical transform
    vtrn.16       d14, d15              @T12
    vtrn.16       d16, d17              @T23
    vtrn.32       d14, d16              @T13
    vtrn.32       d15, d17              @T24

    @let us do the vertical transform
    @same code as horiz
    vadd.s16      d18, d14, d17         @x0 = x4+x7
    vadd.s16      d19, d15, d16         @x1 = x5+x6
    vsub.s16      d20, d15, d16         @x2 = x5-x6
    vsub.s16      d21, d14, d17         @x3 = x4-x7

    vshl.s16      d22, d20, #1          @U_SHIFT(x2,1,shft)
    vshl.s16      d23, d21, #1          @U_SHIFT(x3,1,shft)

    vdup.s32      q4, r8                @Load rounding value row 1

    vadd.s16      d24, d18, d19         @x5 = x0 + x1;
    vsub.s16      d26, d18, d19         @x7 = x0 - x1;
    vadd.s16      d25, d23, d20         @x6 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16      d27, d21, d22         @x8 = x3 - U_SHIFT(x2,1,shft);
    vdup.s32      q10, r7               @Load qbit values

    vst1.s16      d24[0], [r10]         @Store the dc value to the alternate dc address

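@the unquantized DC term is saved separately so that a caller taking the DC
@Hadamard path (ih264_hadamard_quant_4x4_a9 below) still has access to it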
@core transform of the 4x4 block is done
    vld1.s16      {q14-q15}, [r5]       @load the scaling values

    vabs.s16      q0, q12               @Abs val of rows 1 and 2

    vabs.s16      q1, q13               @Abs val of rows 3 and 4

    vmov.s32      q5, q4                @copy round fact for row 2

    vmov.s32      q6, q4                @copy round fact for row 3
    vclt.s16      q2, q12, #0           @Get the sign of rows 1 and 2

    vmov.s32      q7, q4                @copy round fact for row 4
    vclt.s16      q3, q13, #0           @Get the sign of rows 3 and 4

    vmlal.s16     q4, d0, d28           @Multiply and add row 1
    vmlal.s16     q5, d1, d29           @Multiply and add row 2
    vmlal.s16     q6, d2, d30           @Multiply and add row 3
    vmlal.s16     q7, d3, d31           @Multiply and add row 4

    vshl.s32      q11, q4, q10          @Shift row 1
    vshl.s32      q12, q5, q10          @Shift row 2
    vshl.s32      q13, q6, q10          @Shift row 3
    vshl.s32      q14, q7, q10          @Shift row 4

    vmovn.s32     d30, q11              @Narrow row 1
    vmovn.s32     d31, q12              @Narrow row 2
    vmovn.s32     d0 , q13              @Narrow row 3
    vmovn.s32     d1 , q14              @Narrow row 4

    vneg.s16      q1, q15               @Negate rows 1 and 2
    vneg.s16      q4, q0                @Negate rows 3 and 4

    vceq.s16      q5, q15, #0           @Compare rows 1 and 2 with zero
    vceq.s16      q6, q0 , #0           @Compare rows 3 and 4 with zero

    vbsl.s16      q2, q1, q15           @Restore sign of rows 1 and 2
    vbsl.s16      q3, q4, q0            @Restore sign of rows 3 and 4
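
    @the multiply/shift/select sequence above computes, per coefficient w:
    @  level = sign(w) * ((abs(w) * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits)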


    vmovn.u16     d14, q5               @Narrow the comparison for rows 1 and 2
    vmovn.u16     d15, q6               @Narrow the comparison for rows 3 and 4

    vshr.u8       q8, q7, #7            @Reduce each comparison to a single bit per coefficient [keep the value for later use]

    vpadd.u8      d18, d16, d17         @pair add nnz 1
    vpadd.u8      d20, d18, d19         @pair add nnz 2
    vpadd.u8      d22, d20, d21         @pair add nnz 3
    vpadd.u8      d24, d22, d23         @pair add nnz 4
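
    @each zero coefficient contributed a one-bit after the VCEQ/VSHR pair, so
    @the pairwise adds leave the count of zero coefficients in d24[0];
    @nnz = 16 - that count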
    vst1.s16      {q2-q3}, [r2]         @Store the block

    vmov.u8       d25, #16              @Get max nnz
    vsub.u8       d26, d25, d24         @nnz = 16 - number of zero coefficients

    vst1.u8       d26[0], [r9]          @Write nnz

    vpop          {d8-d15}
    pop           {r4-r12, pc}



@*****************************************************************************
@*
@* Function Name     : ih264_resi_trans_quant_chroma_4x4_a9
@* Description       : This function does residue calculation, forward transform
@*                     and quantization for 4x4 chroma block.
@*
@* Arguments         :  R0 :pointer to src buffer
@                       R1 :pointer to pred buffer
@                       R2 :pointer to dst buffer
@                       R3 :source stride
@                       STACK : pred stride,
@                               dst stride,
@                               pointer to scaling matrix,
@                               pointer to threshold matrix,
@                               qbits,
@                               rounding factor,
@                               pointer to store nnz
@                               pointer to store unquantized dc values
@ Values Returned   : NONE
@
@ Register Usage    :
@ Stack Usage       : 40 bytes
@ Cycles            : Around
@ Interruptibility  : Interruptible
@
@ Known Limitations
@   Assumptions     :
@
@ Revision History  :
@         DD MM YYYY    Author(s)   Changes
@         11 2 2015    100664      First version
@
@*****************************************************************************

    .global ih264_resi_trans_quant_chroma_4x4_a9
ih264_resi_trans_quant_chroma_4x4_a9:

    @R0     :pointer to src buffer
    @R1     :pointer to pred buffer
    @R2     :pointer to dst buffer
    @R3     :Source stride
    @STACK  :pred stride
    @       :scale matrix
    @       :threshold matrix
    @       :qbits
    @       :round factor
    @       :nnz
    @       :pu1_dc_alt_addr
    push          {r4-r12, lr}          @push all the variables first

    add           r11, sp, #40          @point r11 at the arguments passed on the stack (above the 40 bytes of saved registers)
    ldmfd         r11, {r4-r10}         @load the stack arguments into registers

    @R0     :pointer to src buffer
    @R1     :pointer to pred buffer
    @R2     :pointer to dst buffer
    @R3     :Source stride
    @R4     :Pred stride
    @R5     :scale matrix
    @R6     :threshold matrix
    @R7     :qbits
    @R8     :round factor
    @R9     :nnz
    @R10    :pointer to store unquantized dc value
    vpush         {d8-d15}
    mov           r11, #0
    sub           r7, r11, r7           @negate the qbits value for use with LSL

    @------------Function loading done----------------;

    vld2.u8       {d10, d11}, [r0], r3  @load first 8 pix src row 1

    vld2.u8       {d11, d12}, [r1], r4  @load first 8 pix pred row 1

    vld2.u8       {d28, d29}, [r0], r3  @load first 8 pix src row 2

    vld2.u8       {d29, d30}, [r1], r4  @load first 8 pix pred row 2

    vld2.u8       {d25, d26}, [r0], r3  @load first 8 pix src row 3

    vld2.u8       {d26, d27}, [r1], r4  @load first 8 pix pred row 3
    vsubl.u8      q0, d10, d11          @find residue row 1

    vld2.u8       {d22, d23}, [r0], r3  @load first 8 pix src row 4

    vld2.u8       {d23, d24}, [r1], r4  @load first 8 pix pred row 4
    vsubl.u8      q1, d28, d29          @find residue row 2

    vsubl.u8      q2, d25, d26          @find residue row 3
    vsubl.u8      q3, d22, d23          @find residue row 4
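
    @the VLD2s de-interleave the two chroma components; because of the
    @overlapping register lists, d10/d28/d25/d22 hold the source samples of the
    @plane being coded and d11/d29/d26/d23 the corresponding prediction
    @samples, so the VSUBLs above form the residue of a single plane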

    vtrn.16       d0, d2                @T12
    vtrn.16       d4, d6                @T23
    vtrn.32       d0, d4                @T13
    vtrn.32       d2, d6                @T14

    vadd.s16      d8 , d0, d6           @x0 = x4+x7
    vadd.s16      d9 , d2, d4           @x1 = x5+x6
    vsub.s16      d10, d2, d4           @x2 = x5-x6
    vsub.s16      d11, d0, d6           @x3 = x4-x7

    vshl.s16      d12, d10, #1          @U_SHIFT(x2,1,shft)
    vshl.s16      d13, d11, #1          @U_SHIFT(x3,1,shft)

    vadd.s16      d14, d8, d9           @x4 = x0 + x1;
    vsub.s16      d16, d8, d9           @x6 = x0 - x1;
    vadd.s16      d15, d13, d10         @x5 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16      d17, d11, d12         @x7 = x3 - U_SHIFT(x2,1,shft);

    @take the transpose again so we can do the vertical transform
    vtrn.16       d14, d15              @T12
    vtrn.16       d16, d17              @T23
    vtrn.32       d14, d16              @T13
    vtrn.32       d15, d17              @T24

    @let us do the vertical transform
    @same code as horiz
    vadd.s16      d18, d14, d17         @x0 = x4+x7
    vadd.s16      d19, d15, d16         @x1 = x5+x6
    vsub.s16      d20, d15, d16         @x2 = x5-x6
    vsub.s16      d21, d14, d17         @x3 = x4-x7

    vshl.s16      d22, d20, #1          @U_SHIFT(x2,1,shft)
    vshl.s16      d23, d21, #1          @U_SHIFT(x3,1,shft)

    vdup.s32      q4, r8                @Load rounding value row 1

    vadd.s16      d24, d18, d19         @x5 = x0 + x1;
    vsub.s16      d26, d18, d19         @x7 = x0 - x1;
    vadd.s16      d25, d23, d20         @x6 = U_SHIFT(x3,1,shft) + x2;
    vsub.s16      d27, d21, d22         @x8 = x3 - U_SHIFT(x2,1,shft);
    vdup.s32      q10, r7               @Load qbit values

    vst1.s16      d24[0], [r10]         @Store the unquantized dc value to the alternate dc address

@core transform of the 4x4 block is done
    vld1.s16      {q14-q15}, [r5]       @load the scaling values

    vabs.s16      q0, q12               @Abs val of rows 1 and 2

    vabs.s16      q1, q13               @Abs val of rows 3 and 4

    vmov.s32      q5, q4                @copy round fact for row 2

    vmov.s32      q6, q4                @copy round fact for row 3
    vclt.s16      q2, q12, #0           @Get the sign of rows 1 and 2

    vmov.s32      q7, q4                @copy round fact for row 4
    vclt.s16      q3, q13, #0           @Get the sign of rows 3 and 4

    vmlal.s16     q4, d0, d28           @Multiply and add row 1
    vmlal.s16     q5, d1, d29           @Multiply and add row 2
    vmlal.s16     q6, d2, d30           @Multiply and add row 3
    vmlal.s16     q7, d3, d31           @Multiply and add row 4

    vshl.s32      q11, q4, q10          @Shift row 1
    vshl.s32      q12, q5, q10          @Shift row 2
    vshl.s32      q13, q6, q10          @Shift row 3
    vshl.s32      q14, q7, q10          @Shift row 4

    vmovn.s32     d30, q11              @Narrow row 1
    vmovn.s32     d31, q12              @Narrow row 2
    vmovn.s32     d0 , q13              @Narrow row 3
    vmovn.s32     d1 , q14              @Narrow row 4

    vneg.s16      q1, q15               @Negate rows 1 and 2
    vneg.s16      q4, q0                @Negate rows 3 and 4

    vceq.s16      q5, q15, #0           @Compare rows 1 and 2 with zero
    vceq.s16      q6, q0 , #0           @Compare rows 3 and 4 with zero

    vbsl.s16      q2, q1, q15           @Restore sign of rows 1 and 2
    vbsl.s16      q3, q4, q0            @Restore sign of rows 3 and 4

    vmovn.u16     d14, q5               @Narrow the comparison for rows 1 and 2
    vmovn.u16     d15, q6               @Narrow the comparison for rows 3 and 4

    vshr.u8       q8, q7, #7            @Reduce each comparison to a single bit per coefficient [keep the value for later use]

    vpadd.u8      d18, d16, d17         @pair add nnz 1
    vpadd.u8      d20, d18, d19         @pair add nnz 2
    vpadd.u8      d22, d20, d21         @pair add nnz 3
    vpadd.u8      d24, d22, d23         @pair add nnz 4
    vst1.s16      {q2-q3}, [r2]         @Store the block

    vmov.u8       d25, #16              @Get max nnz
    vsub.u8       d26, d25, d24         @nnz = 16 - number of zero coefficients

    vst1.u8       d26[0], [r9]          @Write nnz

    vpop          {d8-d15}
    pop           {r4-r12, pc}



@*****************************************************************************
@*
@* Function Name     : ih264_hadamard_quant_4x4_a9
@* Description       : This function does forward hadamard transform and
@*                     quantization for luma dc block
@*
@* Arguments         :  R0 :pointer to src buffer
@                       R1 :pointer to dst buffer
@                       R2 :pu2_scale_matrix
@                       R3 :pu2_threshold_matrix
@                       STACK : u4_qbits
@                               u4_round_factor
@                               pu1_nnz
@ Values Returned   : NONE
@
@ Register Usage    :
@ Stack Usage       : 0 bytes
@ Cycles            : Around
@ Interruptibility  : Interruptible
@
@ Known Limitations
@   Assumptions     :
@
@ Revision History  :
@         DD MM YYYY    Author(s)   Changes
@         20 2 2015    100633      First version
@
@*****************************************************************************
@ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
@                           const UWORD16 *pu2_scale_matrix,
@                           const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
@                           UWORD32 u4_round_factor,UWORD8  *pu1_nnz
@                           )
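@
@Operation (reference outline): a 4x4 Hadamard transform is applied to the
@luma DC coefficients, each result is halved (>> 1), and the values are then
@quantized with pu2_scale_matrix[0]:
@   level = sign(w) * ((abs(w) * pu2_scale_matrix[0] + u4_round_factor) >> u4_qbits)
@The nnz of the quantized DC block is written to pu1_nnz.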
    .global ih264_hadamard_quant_4x4_a9
ih264_hadamard_quant_4x4_a9:

@Register usage
@   r0 : src
@   r1 : dst
@   r2 : *pu2_scale_matrix
@   r3 : *pu2_threshold_matrix

    vld4.s16      {d0, d1, d2, d3}, [r0]! @Load 4x4 block
    vpush         {d8-d15}

    vld1.u16      d30[0], [r2]          @load pu2_scale_matrix[0]

    vaddl.s16     q3, d0, d3            @x0 = x4 + x7;
    vaddl.s16     q4, d1, d2            @x1 = x5 + x6;
    vsubl.s16     q5, d1, d2            @x2 = x5 - x6;
    vsubl.s16     q6, d0, d3            @x3 = x4 - x7;

    vdup.u16      d30, d30[0]           @pu2_scale_matrix[0]

    vadd.s32      q7, q3, q4            @pi2_dst[0] = x0 + x1;
    vadd.s32      q8, q6, q5            @pi2_dst[1] = x3 + x2;
    add           r3, sp, #68           @Get address of u4_round_factor
    vsub.s32      q9, q3, q4            @pi2_dst[2] = x0 - x1;
    vsub.s32      q10, q6, q5           @pi2_dst[3] = x3 - x2;

    vtrn.s32      q7, q8                @transpose 4x4 block
    vtrn.s32      q9, q10
    vld1.s32      d0[0], [r3]           @load   u4_round_factor
    vswp          d15, d18
    vswp          d17, d20
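
    @the two VTRN.32 operations plus the two VSWPs above complete a full 4x4
    @transpose of the 32-bit intermediate values held in q7-q10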

    add           r3, sp, #64           @Get address of u4_qbits
    vadd.s32      q11, q7, q10          @x0 = x4 + x7;
    vadd.s32      q12, q8, q9           @x1 = x5 + x6;
    vld1.s32      d31[0], [r3]          @load  u4_qbits
    vsub.s32      q13, q8, q9           @x2 = x5 - x6;
    vsub.s32      q14, q7, q10          @x3 = x4 - x7;

    vdup.s32      q7, d0[0]             @u4_round_factor

    vadd.s32      q0, q11, q12          @(x0 + x1)
    vadd.s32      q1, q14, q13          @(x3 + x2)
    vsub.s32      q2, q11, q12          @(x0 - x1)
    vsub.s32      q3, q14, q13          @(x3 - x2)

    vdup.s32      q11, d31[0]           @u4_qbits

    vshrn.s32     d0, q0, #1            @i4_value = (x0 + x1) >> 1;
    vshrn.s32     d1, q1, #1            @i4_value = (x3 + x2) >> 1;
    vshrn.s32     d2, q2, #1            @i4_value = (x0 - x1) >> 1;
    vshrn.s32     d3, q3, #1            @i4_value = (x3 - x2) >> 1;

    vabs.s16      q5, q0
    vabs.s16      q6, q1

    vmov.s32      q8, q7                @Get the round fact
    vmov.s32      q9, q7
    vmov.s32      q10, q7

    vclt.s16      q3, q0, #0            @get the sign of rows 1 and 2
    vclt.s16      q4, q1, #0

    vneg.s32      q11, q11              @-u4_qbits, for the right shift via VSHL

    vmlal.u16     q7, d10, d30
    vmlal.u16     q8, d11, d30
    vmlal.u16     q9, d12, d30
    vmlal.u16     q10, d13, d30

    vshl.u32      q7, q7, q11
    vshl.u32      q8, q8, q11
    vshl.u32      q9, q9, q11
    vshl.u32      q10, q10, q11

    vqmovn.u32    d22, q7
    vqmovn.u32    d23, q8
    vqmovn.u32    d24, q9
    vqmovn.u32    d25, q10

    vneg.s16      q13, q11
    vneg.s16      q14, q12

    vbsl.s16      q3, q13, q11
    vbsl.s16      q4, q14, q12

    vceq.s16      q5, q11, #0
    vceq.s16      q6, q12, #0

    vst1.s16      {q3}, [r1]!

    vshrn.u16     d14, q5, #8
    vshrn.u16     d15, q6, #8

    ldr           r3, [sp, #72]         @Load *pu1_nnz

    vshr.u8       q7, q7, #7

    vst1.s16      {q4}, [r1]!

    vadd.u8       d16, d14, d15
    vmov.u8       d20, #16
    vpadd.u8      d17, d16, d16
    vpadd.u8      d18, d17, d17
    vpadd.u8      d19, d18, d18
    vsub.u8       d20, d20, d19
    vst1.u8       d20[0], [r3]

    vpop          {d8-d15}
    bx            lr




@*****************************************************************************
@*
@* Function Name     : ih264_hadamard_quant_2x2_uv_a9
@* Description       : This function does forward hadamard transform and
@*                     quantization for dc block of chroma for both planes
@*
@* Arguments         :  R0 :pointer to src buffer
@                       R1 :pointer to dst buffer
@                       R2 :pu2_scale_matrix
@                       R3 :pu2_threshold_matrix
@                       STACK : u4_qbits
@                               u4_round_factor
@                               pu1_nnz
@ Values Returned   : NONE
@
@ Register Usage    :
@ Stack Usage       : 0 bytes
@ Cycles            : Around
@ Interruptibility  : Interruptible
@
@ Known Limitations
@   Assumptions     :
@
@ Revision History  :
@         DD MM YYYY    Author(s)   Changes
@         20 2 2015    100633      First version
@
@*****************************************************************************
@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
@                             const UWORD16 *pu2_scale_matrix,
@                             const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
@                             UWORD32 u4_round_factor,UWORD8  *pu1_nnz
@                             )
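@
@Operation (reference outline): the eight DC terms (four per chroma plane) are
@loaded with VLD2 so that the pair sums and differences of the 2x2 Hadamard
@transform can be formed for both planes at once; the transformed values are
@quantized with pu2_scale_matrix[0], and one nnz byte per plane (4 minus the
@number of zero coefficients in that plane) is written to pu1_nnz.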

    .global ih264_hadamard_quant_2x2_uv_a9
ih264_hadamard_quant_2x2_uv_a9:

    vpush         {d8-d15}
    vld2.s16      {d0-d1}, [r0]         @load src

    add           r3, sp, #68           @Get address of u4_round_factor

    vaddl.s16     q3, d0, d1            @x0 = x4 + x5;, x2 = x6 + x7;
    vld1.u16      d30[0], [r2]          @load pu2_scale_matrix[0]
    vsubl.s16     q4, d0, d1            @x1 = x4 - x5;  x3 = x6 - x7;

    add           r0, sp, #64           @Get address of u4_qbits
    vld1.s32      d28[0], [r3]          @load   u4_round_factor
    vtrn.s32      q3, q4                @q3 -> x0 x1, q4 -> x2 x3

    vadd.s32      q0, q3, q4            @ (x0 + x2) (x1 + x3)  (y0 + y2); (y1 + y3);
    vld1.s32      d24[0], [r0]          @load  u4_qbits
    vsub.s32      q1, q3, q4            @ (x0 - x2) (x1 - x3)  (y0 - y2); (y1 - y3);

    vdup.u16      d30, d30[0]           @pu2_scale_matrix

    vabs.s32      q2, q0
    vabs.s32      q3, q1

    vdup.s32      q14, d28[0]           @u4_round_factor

    vmovl.u16     q15, d30              @pu2_scale_matrix

    vclt.s32      q4, q0, #0            @get the sign of rows 1 and 2
    vdup.s32      q12, d24[0]           @u4_qbits
    vclt.s32      q5, q1, #0

    vqmovn.u32    d8, q4
    vqmovn.s32    d9, q5

    vmov.s32      q13, q14              @Get the round fact
    vneg.s32      q12, q12              @-u4_qbits, for the right shift via VSHL

    vmla.u32      q13, q2, q15
    vmla.u32      q14, q3, q15

    vshl.u32      q13, q13, q12         @>>qbit
    vshl.u32      q14, q14, q12         @>>qbit

    vqmovn.u32    d10, q13
    vqmovn.u32    d11, q14

    vneg.s16      q6, q5

    vbsl.s16      q4, q6, q5            @*sign

    vtrn.s32      d8, d9

    vceq.s16      q7, q4, #0            @Compute nnz

    vshrn.u16     d14, q7, #8           @reduce nnz comparison to 1 bit

    ldr           r3, [sp, #72]         @Load *pu1_nnz
    vshr.u8       d14, d14, #7          @reduce nnz comparison to 1 bit
    vmov.u8       d20, #4               @Since we counted zeros, subtract from 4 to get nnz
    vpadd.u8      d17, d14, d14         @Sum up nnz

    vst1.s16      {q4}, [r1]!           @Store the block

    vpadd.u8      d17, d17, d17         @Sum up nnz
    vsub.u8       d20, d20, d17         @4 - number of zeros
    vst1.u16      d20[0], [r3]          @store nnz

    vpop          {d8-d15}
    bx            lr



