1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///*****************************************************************************/
21///**
22//*******************************************************************************
23//* @file
24//*  ih264_resi_trans_quant_av8.c
25//*
26//* @brief
27//*  contains function definitions for residual and forward trans
28//*
29//* @author
30//*  ittiam
31//*
32//* @par list of functions:
33//*    ih264_resi_trans_quant_4x4_av8
34//*    ih264_resi_trans_quant_8x8_av8
35//*    ih264_resi_trans_quant_chroma_4x4_av8
36//* @remarks
37//*  none
38//*
39//*******************************************************************************
40.include "ih264_neon_macros.s"
41.text
42.p2align 2
//*****************************************************************************
//*
//* function name     : ih264_resi_trans_quant_4x4_av8
//* description       : computes the residue (src - pred) of a 4x4 block,
//*                     applies the h264 forward core transform (cf4) to it
//*                     and quantizes the 16 coefficients. the unquantized dc
//*                     coefficient and the count of non zero quantized
//*                     coefficients are written out as well.
//*
//* arguments         :  x0      :pointer to src buffer
//*                      x1      :pointer to pred buffer
//*                      x2      :pointer to dst buffer (16 halfwords written)
//*                      w3      :source stride
//*                      w4      :pred stride
//*                      x5      :pointer to scale matrix (16 halfwords read)
//*                      x6      :pointer to threshold matrix (not read here)
//*                      w7      :qbits
//*                      [stack] :round factor
//*                      [stack] :pointer to nnz (one byte written)
//*                      [stack] :pointer to store the non quantized dc value
//* values returned   : none
//*
//* register usage    : v0-v31, w7-w10 are overwritten
//* stack usage       : 64 bytes (assumed size of the push_v_regs frame, the
//*                     macro lives in ih264_neon_macros.s - confirm there)
//* cycles            :
//* interruptibility  : interruptible
//*
//* known limitations
//*   \assumptions    : every row is fetched with an 8 byte load although only
//*                     the first 4 bytes are used, so 4 bytes past each row of
//*                     src and pred must be readable
//*                     NOTE(review): confirm that callers guarantee this
//*
//* revision history  :
//*         dd mm yyyy    author(s)   changes
//*         1 12 2013    100633      first version
//*         20 1 2014    100633      changes the api, optimization
//*
//*****************************************************************************

    .global ih264_resi_trans_quant_4x4_av8
ih264_resi_trans_quant_4x4_av8:

    push_v_regs

    //the three stack arguments are read at sp+64 onwards, i.e. just above
    //the area that push_v_regs is assumed to have pushed
    sxtw      x3, w3                    //widen src stride for post index addressing
    sxtw      x4, w4                    //widen pred stride for post index addressing
    ldr       w8, [sp, #64]             //w8  = round factor
    ldr       x9, [sp, #72]             //x9  = pointer to nnz
    ldr       x10, [sp, #80]            //x10 = pointer for the non quantized dc value
    neg       w7, w7                    //-qbits, sshl with a negative count shifts right

    //------------argument loading done----------------;

    ld1       {v30.8b}, [x0], x3        //src  row 1 (8 bytes fetched, 4 used)
    ld1       {v31.8b}, [x1], x4        //pred row 1
    ld1       {v28.8b}, [x0], x3        //src  row 2
    ld1       {v29.8b}, [x1], x4        //pred row 2
    ld1       {v26.8b}, [x0], x3        //src  row 3
    ld1       {v27.8b}, [x1], x4        //pred row 3
    ld1       {v24.8b}, [x0]            //src  row 4
    ld1       {v25.8b}, [x1]            //pred row 4

    usubl     v0.8h, v30.8b, v31.8b     //residue row 1
    usubl     v2.8h, v28.8b, v29.8b     //residue row 2
    usubl     v4.8h, v26.8b, v27.8b     //residue row 3
    usubl     v6.8h, v24.8b, v25.8b     //residue row 4

    //transpose the 4x4 residue so that each register holds one column
    trn1      v1.4h, v0.4h, v2.4h
    trn2      v3.4h, v0.4h, v2.4h
    trn1      v5.4h, v4.4h, v6.4h
    trn2      v7.4h, v4.4h, v6.4h

    trn1      v0.2s, v1.2s, v5.2s       //column 0
    trn2      v4.2s, v1.2s, v5.2s       //column 2
    trn1      v2.2s, v3.2s, v7.2s       //column 1
    trn2      v6.2s, v3.2s, v7.2s       //column 3

    //horizontal transform, 4 rows in parallel
    add       v8.4h, v0.4h, v6.4h       //x0 = x4+x7
    add       v9.4h, v2.4h, v4.4h       //x1 = x5+x6
    sub       v10.4h, v2.4h, v4.4h      //x2 = x5-x6
    sub       v11.4h, v0.4h, v6.4h      //x3 = x4-x7

    shl       v12.4h, v10.4h, #1        //2*x2
    shl       v13.4h, v11.4h, #1        //2*x3

    add       v14.4h, v8.4h, v9.4h      //x4 = x0 + x1;
    sub       v16.4h, v8.4h, v9.4h      //x6 = x0 - x1;
    add       v15.4h, v13.4h, v10.4h    //x5 = 2*x3 + x2;
    sub       v17.4h, v11.4h, v12.4h    //x7 = x3 - 2*x2;

    //transpose again so that the same butterfly does the vertical pass
    trn1      v0.4h, v14.4h, v15.4h
    trn2      v1.4h, v14.4h, v15.4h
    trn1      v2.4h, v16.4h, v17.4h
    trn2      v3.4h, v16.4h, v17.4h

    trn1      v14.2s, v0.2s, v2.2s
    trn2      v16.2s, v0.2s, v2.2s
    trn1      v15.2s, v1.2s, v3.2s
    trn2      v17.2s, v1.2s, v3.2s

    //vertical transform, same butterfly as the horizontal one
    add       v18.4h, v14.4h, v17.4h    //x0 = x4+x7
    add       v19.4h, v15.4h, v16.4h    //x1 = x5+x6
    sub       v20.4h, v15.4h, v16.4h    //x2 = x5-x6
    sub       v21.4h, v14.4h, v17.4h    //x3 = x4-x7

    shl       v22.4h, v20.4h, #1        //2*x2
    shl       v23.4h, v21.4h, #1        //2*x3

    add       v24.4h, v18.4h, v19.4h    //coefficient row 1 = x0 + x1;
    sub       v26.4h, v18.4h, v19.4h    //coefficient row 3 = x0 - x1;
    add       v25.4h, v23.4h, v20.4h    //coefficient row 2 = 2*x3 + x2;
    sub       v27.4h, v21.4h, v22.4h    //coefficient row 4 = x3 - 2*x2;

    dup       v23.4s, w8                //round factor in every 32 bit lane

    st1       {v24.h}[0], [x10]         //dc coefficient before quantization

    //core transform done, quantize |coeff| and put the sign back afterwards
    ld1       {v28.4h-v31.4h}, [x5]     //scale values, one register per row

    abs       v0.4h, v24.4h             //|row 1|
    abs       v1.4h, v25.4h             //|row 2|
    abs       v2.4h, v26.4h             //|row 3|
    abs       v3.4h, v27.4h             //|row 4|

    //all ones in the lanes holding a positive coefficient
    cmgt      v4.4h, v24.4h, #0
    cmgt      v5.4h, v25.4h, #0
    cmgt      v6.4h, v26.4h, #0
    cmgt      v7.4h, v27.4h, #0

    smull     v0.4s, v0.4h, v28.4h      //|row 1| * scale
    smull     v1.4s, v1.4h, v29.4h      //|row 2| * scale
    smull     v2.4s, v2.4h, v30.4h      //|row 3| * scale
    smull     v3.4s, v3.4h, v31.4h      //|row 4| * scale

    add       v20.4s, v0.4s, v23.4s     //+ round factor
    add       v21.4s, v1.4s, v23.4s
    add       v22.4s, v2.4s, v23.4s
    add       v23.4s, v3.4s, v23.4s     //v23 is consumed here, keep this one last

    dup       v24.4s, w7                //-qbits in every lane

    sshl      v20.4s, v20.4s, v24.4s    //>> qbits row 1
    sshl      v21.4s, v21.4s, v24.4s    //>> qbits row 2
    sshl      v22.4s, v22.4s, v24.4s    //>> qbits row 3
    sshl      v23.4s, v23.4s, v24.4s    //>> qbits row 4

    xtn       v20.4h, v20.4s            //narrow row 1
    xtn       v21.4h, v21.4s            //narrow row 2
    xtn       v22.4h, v22.4s            //narrow row 3
    xtn       v23.4h, v23.4s            //narrow row 4

    neg       v24.4h, v20.4h            //negated magnitude row 1
    neg       v25.4h, v21.4h            //negated magnitude row 2
    neg       v26.4h, v22.4h            //negated magnitude row 3
    neg       v27.4h, v23.4h            //negated magnitude row 4

    //all ones in the lanes whose quantized coefficient is zero
    cmeq      v0.4h, v20.4h, #0
    cmeq      v1.4h, v21.4h, #0
    cmeq      v2.4h, v22.4h, #0
    cmeq      v3.4h, v23.4h, #0

    //positive lanes keep the magnitude, the others take the negated one
    bsl       v4.8b, v20.8b, v24.8b     //signed result row 1
    bsl       v5.8b, v21.8b, v25.8b     //signed result row 2
    bsl       v6.8b, v22.8b, v26.8b     //signed result row 3
    bsl       v7.8b, v23.8b, v27.8b     //signed result row 4

    //pack the four zero masks into two registers and narrow them to bytes
    mov       v0.d[1], v2.d[0]          //v0 = masks of rows 1 and 3
    mov       v1.d[1], v3.d[0]          //v1 = masks of rows 2 and 4

    xtn       v0.8b, v0.8h
    xtn       v1.8b, v1.8h

    ushr      v0.8b, v0.8b, #7          //0xff -> 1, one flag per zero coefficient
    ushr      v1.8b, v1.8b, #7

    add       v0.8b, v0.8b, v1.8b       //three pairwise adds leave the number
    addp      v0.8b, v0.8b, v0.8b       //of zero coefficients in byte 0
    addp      v0.8b, v0.8b, v0.8b
    addp      v0.8b, v0.8b, v0.8b

    st1       {v4.4h-v7.4h}, [x2]       //store the quantized block

    movi      v25.8b, #16               //coefficients in a 4x4 block
    sub       v26.8b, v25.8b, v0.8b     //nnz = 16 - number of zeros
    st1       {v26.b}[0], [x9]          //write nnz

    pop_v_regs
    ret
232
233
//*****************************************************************************
//*
//* function name     : ih264_resi_trans_quant_chroma_4x4_av8
//* description       : this function does residue calculation, forward
//*                     transform and quantization for a 4x4 chroma block.
//*                     only the even bytes of each 8 byte row of src and pred
//*                     are used (one plane of the interleaved data), apart
//*                     from that the code is the same as the luma function.
//*
//* arguments         :  x0      :pointer to src buffer
//*                      x1      :pointer to pred buffer
//*                      x2      :pointer to dst buffer (16 halfwords written)
//*                      w3      :source stride
//*                      w4      :pred stride
//*                      x5      :pointer to scale matrix (16 halfwords read)
//*                      x6      :pointer to threshold matrix (not read here)
//*                      w7      :qbits
//*                      [stack] :round factor
//*                      [stack] :pointer to nnz (one byte written)
//*                      [stack] :pointer to store the non quantized dc value
//* values returned   : none
//*
//* register usage    : v0-v31, w7-w10 are overwritten
//* stack usage       : 64 bytes (assumed size of the push_v_regs frame, the
//*                     macro lives in ih264_neon_macros.s - confirm there)
//* cycles            :
//* interruptibility  : interruptible
//*
//* known limitations
//*   \assumptions    :
//*
//* revision history  :
//*         dd mm yyyy    author(s)   changes
//*         11 2 2015    100664      first version
//*         25 2 2015    100633      first av8 version
//*****************************************************************************

    .global ih264_resi_trans_quant_chroma_4x4_av8
ih264_resi_trans_quant_chroma_4x4_av8:

    push_v_regs

    //the three stack arguments are read at sp+64 onwards, i.e. just above
    //the area that push_v_regs is assumed to have pushed
    sxtw      x3, w3                    //widen src stride for post index addressing
    sxtw      x4, w4                    //widen pred stride for post index addressing
    ldr       w8, [sp, #64]             //w8  = round factor
    ldr       x9, [sp, #72]             //x9  = pointer to nnz
    ldr       x10, [sp, #80]            //x10 = pointer for the non quantized dc value
    neg       w7, w7                    //-qbits, sshl with a negative count shifts right

    //------------argument loading done----------------;

    ld1       {v30.8b}, [x0], x3        //src  row 1, 8 interleaved bytes
    ld1       {v31.8b}, [x1], x4        //pred row 1
    ld1       {v28.8b}, [x0], x3        //src  row 2
    ld1       {v29.8b}, [x1], x4        //pred row 2
    ld1       {v26.8b}, [x0], x3        //src  row 3
    ld1       {v27.8b}, [x1], x4        //pred row 3
    ld1       {v24.8b}, [x0]            //src  row 4
    ld1       {v25.8b}, [x1]            //pred row 4

    //deinterleave: keep bytes 0,2,4,6 of every row in the low 4 bytes
    //this is the only difference between the chroma and luma functions
    uzp1      v30.8b, v30.8b, v30.8b
    uzp1      v31.8b, v31.8b, v31.8b
    uzp1      v28.8b, v28.8b, v28.8b
    uzp1      v29.8b, v29.8b, v29.8b
    uzp1      v26.8b, v26.8b, v26.8b
    uzp1      v27.8b, v27.8b, v27.8b
    uzp1      v24.8b, v24.8b, v24.8b
    uzp1      v25.8b, v25.8b, v25.8b

    usubl     v0.8h, v30.8b, v31.8b     //residue row 1
    usubl     v2.8h, v28.8b, v29.8b     //residue row 2
    usubl     v4.8h, v26.8b, v27.8b     //residue row 3
    usubl     v6.8h, v24.8b, v25.8b     //residue row 4

    //transpose the 4x4 residue so that each register holds one column
    trn1      v1.4h, v0.4h, v2.4h
    trn2      v3.4h, v0.4h, v2.4h
    trn1      v5.4h, v4.4h, v6.4h
    trn2      v7.4h, v4.4h, v6.4h

    trn1      v0.2s, v1.2s, v5.2s       //column 0
    trn2      v4.2s, v1.2s, v5.2s       //column 2
    trn1      v2.2s, v3.2s, v7.2s       //column 1
    trn2      v6.2s, v3.2s, v7.2s       //column 3

    //horizontal transform, 4 rows in parallel
    add       v8.4h, v0.4h, v6.4h       //x0 = x4+x7
    add       v9.4h, v2.4h, v4.4h       //x1 = x5+x6
    sub       v10.4h, v2.4h, v4.4h      //x2 = x5-x6
    sub       v11.4h, v0.4h, v6.4h      //x3 = x4-x7

    shl       v12.4h, v10.4h, #1        //2*x2
    shl       v13.4h, v11.4h, #1        //2*x3

    add       v14.4h, v8.4h, v9.4h      //x4 = x0 + x1;
    sub       v16.4h, v8.4h, v9.4h      //x6 = x0 - x1;
    add       v15.4h, v13.4h, v10.4h    //x5 = 2*x3 + x2;
    sub       v17.4h, v11.4h, v12.4h    //x7 = x3 - 2*x2;

    //transpose again so that the same butterfly does the vertical pass
    trn1      v0.4h, v14.4h, v15.4h
    trn2      v1.4h, v14.4h, v15.4h
    trn1      v2.4h, v16.4h, v17.4h
    trn2      v3.4h, v16.4h, v17.4h

    trn1      v14.2s, v0.2s, v2.2s
    trn2      v16.2s, v0.2s, v2.2s
    trn1      v15.2s, v1.2s, v3.2s
    trn2      v17.2s, v1.2s, v3.2s

    //vertical transform, same butterfly as the horizontal one
    add       v18.4h, v14.4h, v17.4h    //x0 = x4+x7
    add       v19.4h, v15.4h, v16.4h    //x1 = x5+x6
    sub       v20.4h, v15.4h, v16.4h    //x2 = x5-x6
    sub       v21.4h, v14.4h, v17.4h    //x3 = x4-x7

    shl       v22.4h, v20.4h, #1        //2*x2
    shl       v23.4h, v21.4h, #1        //2*x3

    add       v24.4h, v18.4h, v19.4h    //coefficient row 1 = x0 + x1;
    sub       v26.4h, v18.4h, v19.4h    //coefficient row 3 = x0 - x1;
    add       v25.4h, v23.4h, v20.4h    //coefficient row 2 = 2*x3 + x2;
    sub       v27.4h, v21.4h, v22.4h    //coefficient row 4 = x3 - 2*x2;

    dup       v23.4s, w8                //round factor in every 32 bit lane

    st1       {v24.h}[0], [x10]         //dc coefficient before quantization

    //core transform done, quantize |coeff| and put the sign back afterwards
    ld1       {v28.4h-v31.4h}, [x5]     //scale values, one register per row

    abs       v0.4h, v24.4h             //|row 1|
    abs       v1.4h, v25.4h             //|row 2|
    abs       v2.4h, v26.4h             //|row 3|
    abs       v3.4h, v27.4h             //|row 4|

    //all ones in the lanes holding a positive coefficient
    cmgt      v4.4h, v24.4h, #0
    cmgt      v5.4h, v25.4h, #0
    cmgt      v6.4h, v26.4h, #0
    cmgt      v7.4h, v27.4h, #0

    smull     v0.4s, v0.4h, v28.4h      //|row 1| * scale
    smull     v1.4s, v1.4h, v29.4h      //|row 2| * scale
    smull     v2.4s, v2.4h, v30.4h      //|row 3| * scale
    smull     v3.4s, v3.4h, v31.4h      //|row 4| * scale

    add       v20.4s, v0.4s, v23.4s     //+ round factor
    add       v21.4s, v1.4s, v23.4s
    add       v22.4s, v2.4s, v23.4s
    add       v23.4s, v3.4s, v23.4s     //v23 is consumed here, keep this one last

    dup       v24.4s, w7                //-qbits in every lane

    sshl      v20.4s, v20.4s, v24.4s    //>> qbits row 1
    sshl      v21.4s, v21.4s, v24.4s    //>> qbits row 2
    sshl      v22.4s, v22.4s, v24.4s    //>> qbits row 3
    sshl      v23.4s, v23.4s, v24.4s    //>> qbits row 4

    xtn       v20.4h, v20.4s            //narrow row 1
    xtn       v21.4h, v21.4s            //narrow row 2
    xtn       v22.4h, v22.4s            //narrow row 3
    xtn       v23.4h, v23.4s            //narrow row 4

    neg       v24.4h, v20.4h            //negated magnitude row 1
    neg       v25.4h, v21.4h            //negated magnitude row 2
    neg       v26.4h, v22.4h            //negated magnitude row 3
    neg       v27.4h, v23.4h            //negated magnitude row 4

    //all ones in the lanes whose quantized coefficient is zero
    cmeq      v0.4h, v20.4h, #0
    cmeq      v1.4h, v21.4h, #0
    cmeq      v2.4h, v22.4h, #0
    cmeq      v3.4h, v23.4h, #0

    //positive lanes keep the magnitude, the others take the negated one
    bsl       v4.8b, v20.8b, v24.8b     //signed result row 1
    bsl       v5.8b, v21.8b, v25.8b     //signed result row 2
    bsl       v6.8b, v22.8b, v26.8b     //signed result row 3
    bsl       v7.8b, v23.8b, v27.8b     //signed result row 4

    //pack the four zero masks into two registers and narrow them to bytes
    mov       v0.d[1], v2.d[0]          //v0 = masks of rows 1 and 3
    mov       v1.d[1], v3.d[0]          //v1 = masks of rows 2 and 4

    xtn       v0.8b, v0.8h
    xtn       v1.8b, v1.8h

    ushr      v0.8b, v0.8b, #7          //0xff -> 1, one flag per zero coefficient
    ushr      v1.8b, v1.8b, #7

    add       v0.8b, v0.8b, v1.8b       //three pairwise adds leave the number
    addp      v0.8b, v0.8b, v0.8b       //of zero coefficients in byte 0
    addp      v0.8b, v0.8b, v0.8b
    addp      v0.8b, v0.8b, v0.8b

    st1       {v4.4h-v7.4h}, [x2]       //store the quantized block

    movi      v25.8b, #16               //coefficients in a 4x4 block
    sub       v26.8b, v25.8b, v0.8b     //nnz = 16 - number of zeros
    st1       {v26.b}[0], [x9]          //write nnz

    pop_v_regs
    ret
434
435
//*****************************************************************************
//*
//* function name     : ih264_hadamard_quant_4x4_av8
//* description       : forward 4x4 hadamard transform of the luma dc block
//*                     followed by quantization. every coefficient is
//*                     quantized with pu2_scale_matrix[0]. the number of non
//*                     zero quantized coefficients is written to pu1_nnz.
//*
//* arguments         :  x0 :pointer to src buffer (16 halfwords read)
//*                      x1 :pointer to dst buffer (16 halfwords written)
//*                      x2 :pu2_scale_matrix (only element 0 is read)
//*                      x3 :pu2_threshold_matrix (not read here)
//*                      w4 :u4_qbits
//*                      w5 :u4_round_factor
//*                      x6 :pu1_nnz (one byte written)
//* values returned   : none
//*
//* register usage    : v0-v7, v14-v25, v30 and w4 are overwritten
//* stack usage       : whatever push_v_regs pushes (macro is defined in
//*                     ih264_neon_macros.s, not visible here)
//* cycles            : around
//* interruptibility  : interruptible
//*
//* known limitations
//*   \assumptions    :
//*
//* revision history  :
//*         dd mm yyyy    author(s)   changes
//*         20 2 2015    100633      first version
//*
//*****************************************************************************
//ih264_hadamard_quant_4x4_av8(word16 *pi2_src, word16 *pi2_dst,
//                           const uword16 *pu2_scale_matrix,
//                           const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
//                           uword32 u4_round_factor,uword8  *pu1_nnz
//                           )
    .global ih264_hadamard_quant_4x4_av8
ih264_hadamard_quant_4x4_av8:

    push_v_regs

    neg       w4, w4                    //-u4_qbits, ushl with a negative count shifts right

    ld4       {v0.4h-v3.4h}, [x0]       //de-interleaving load: v0..v3 = columns 0..3
    ld1r      {v30.8h}, [x2]            //pu2_scale_matrix[0] in all 8 lanes

    //horizontal pass, widened to 32 bit, one row per lane
    saddl     v4.4s, v0.4h, v3.4h       //x0 = x4 + x7;
    saddl     v5.4s, v1.4h, v2.4h       //x1 = x5 + x6;
    ssubl     v6.4s, v1.4h, v2.4h       //x2 = x5 - x6;
    ssubl     v7.4s, v0.4h, v3.4h       //x3 = x4 - x7;

    add       v14.4s, v4.4s, v5.4s      //pi2_dst[0] = x0 + x1;
    add       v15.4s, v7.4s, v6.4s      //pi2_dst[1] = x3 + x2;
    sub       v16.4s, v4.4s, v5.4s      //pi2_dst[2] = x0 - x1;
    sub       v17.4s, v7.4s, v6.4s      //pi2_dst[3] = x3 - x2;

    //4x4 transpose of 32 bit elements, v14..v17 become rows 0..3
    trn1      v18.4s, v14.4s, v15.4s
    trn2      v19.4s, v14.4s, v15.4s
    trn1      v20.4s, v16.4s, v17.4s
    trn2      v21.4s, v16.4s, v17.4s

    trn1      v14.2d, v18.2d, v20.2d
    trn1      v15.2d, v19.2d, v21.2d
    trn2      v16.2d, v18.2d, v20.2d
    trn2      v17.2d, v19.2d, v21.2d

    //vertical pass, one column per lane
    add       v18.4s, v14.4s, v17.4s    //x0 = x4 + x7;
    add       v19.4s, v15.4s, v16.4s    //x1 = x5 + x6;
    sub       v20.4s, v15.4s, v16.4s    //x2 = x5 - x6;
    sub       v21.4s, v14.4s, v17.4s    //x3 = x4 - x7;

    add       v22.4s, v18.4s, v19.4s    //(x0 + x1)
    add       v23.4s, v21.4s, v20.4s    //(x3 + x2)
    sub       v24.4s, v18.4s, v19.4s    //(x0 - x1)
    sub       v25.4s, v21.4s, v20.4s    //(x3 - x2)

    //v14..v17 are free again, preload them with the round factor so that
    //the multiply accumulate below yields abs * scale + round
    dup       v14.4s, w5
    dup       v15.4s, w5
    dup       v16.4s, w5
    dup       v17.4s, w5

    //halve and narrow: v0 = output rows 0 and 1, v1 = output rows 2 and 3
    shrn      v0.4h, v22.4s, #1         //i4_value = (x0 + x1) >> 1;
    shrn2     v0.8h, v23.4s, #1         //i4_value = (x3 + x2) >> 1;
    shrn      v1.4h, v24.4s, #1         //i4_value = (x0 - x1) >> 1;
    shrn2     v1.8h, v25.4s, #1         //i4_value = (x3 - x2) >> 1;

    cmgt      v4.8h, v0.8h, #0          //all ones where the value is positive
    cmgt      v5.8h, v1.8h, #0

    abs       v2.8h, v0.8h              //magnitudes
    abs       v3.8h, v1.8h

    dup       v22.4s, w4                //-u4_qbits in every lane

    umlal     v14.4s, v2.4h, v30.4h     //abs * scale + round
    umlal2    v15.4s, v2.8h, v30.8h
    umlal     v16.4s, v3.4h, v30.4h
    umlal2    v17.4s, v3.8h, v30.8h

    ushl      v14.4s, v14.4s, v22.4s    //>> u4_qbits
    ushl      v15.4s, v15.4s, v22.4s
    ushl      v16.4s, v16.4s, v22.4s
    ushl      v17.4s, v17.4s, v22.4s

    uqxtn     v14.4h, v14.4s            //saturating narrow, rows 0 and 1 in v14
    uqxtn2    v14.8h, v15.4s
    uqxtn     v16.4h, v16.4s            //rows 2 and 3 in v16
    uqxtn2    v16.8h, v17.4s

    cmeq      v0.8h, v14.8h, #0         //all ones where the quantized value is zero
    cmeq      v1.8h, v16.8h, #0

    neg       v15.8h, v14.8h            //negated magnitudes
    neg       v17.8h, v16.8h

    //positive lanes keep the magnitude, the others take the negated one
    bsl       v4.16b, v14.16b, v15.16b
    bsl       v5.16b, v16.16b, v17.16b

    st1       {v4.8h-v5.8h}, [x1]       //store the 16 quantized coefficients

    //count the zero coefficients and derive nnz from that
    xtn       v2.8b, v0.8h
    xtn       v3.8b, v1.8h

    ushr      v2.8b, v2.8b, #7          //0xff -> 1
    ushr      v3.8b, v3.8b, #7

    add       v2.8b, v2.8b, v3.8b       //three pairwise adds leave the number
    addp      v2.8b, v2.8b, v2.8b       //of zero coefficients in byte 0
    addp      v2.8b, v2.8b, v2.8b
    addp      v2.8b, v2.8b, v2.8b

    movi      v20.8b, #16               //coefficients in the block
    sub       v20.8b, v20.8b, v2.8b     //nnz = 16 - number of zeros
    st1       {v20.b}[0], [x6]          //write nnz

    pop_v_regs
    ret
581
582
//*****************************************************************************
//*
//* function name     : ih264_hadamard_quant_2x2_uv_av8
//* description       : this function does forward hadamard transform and
//*                     quantization for dc block of chroma for both planes.
//*                     the 8 input halfwords are treated as two groups of 4
//*                     (one per plane), every coefficient is quantized with
//*                     pu2_scale_matrix[0].
//*
//* arguments         :  x0 :pointer to src buffer (8 halfwords read)
//*                      x1 :pointer to dst buffer (8 halfwords written)
//*                      x2 :pu2_scale_matrix (only element 0 is read)
//*                      x3 :pu2_threshold_matrix (not read here)
//*                      w4 :u4_qbits
//*                      w5 :u4_round_factor
//*                      x6 :pu1_nnz (two bytes written, byte 0 for the first
//*                          4 coefficients and byte 1 for the last 4)
//* values returned   : none
//*
//* register usage    : v0-v5, v20, v24-v26, v30 and w4 are overwritten
//* stack usage       : whatever push_v_regs pushes (macro is defined in
//*                     ih264_neon_macros.s, not visible here)
//* cycles            : around
//* interruptibility  : interruptible
//*
//* known limitations
//*   \assumptions    :
//*
//* revision history  :
//*         dd mm yyyy    author(s)   changes
//*         20 2 2015    100633      first version
//*
//*****************************************************************************
// ih264_hadamard_quant_2x2_uv_av8(word16 *pi2_src, word16 *pi2_dst,
//                             const uword16 *pu2_scale_matrix,
//                             const uword16 *pu2_threshold_matrix, uword32 u4_qbits,
//                             uword32 u4_round_factor,uword8  *pu1_nnz
//                             )

    .global ih264_hadamard_quant_2x2_uv_av8
ih264_hadamard_quant_2x2_uv_av8:

    push_v_regs

    ld2       {v0.4h-v1.4h}, [x0]       //v0 = src[0,2,4,6], v1 = src[1,3,5,7]

    ld1       {v30.h}[0], [x2]          //load pu2_scale_matrix[0]
    dup       v30.4h, v30.h[0]          //replicate it
    uxtl      v30.4s, v30.4h            //widen to 32 bit for mla

    neg       w4, w4                    //-u4_qbits, ushl with a negative count shifts right
    dup       v24.4s, w4

    dup       v25.4s, w5                //round factor, used as the accumulator
    dup       v26.4s, v25.s[0]          //of the two mla below

    saddl     v2.4s, v0.4h, v1.4h       //x0 = x4 + x5;, x2 = x6 + x7;
    ssubl     v3.4s, v0.4h, v1.4h       //x1 = x4 - x5;  x3 = x6 - x7;

    trn1      v4.4s, v2.4s, v3.4s       //x0 x1 of both planes
    trn2      v5.4s, v2.4s, v3.4s       //x2 x3 of both planes

    add       v0.4s, v4.4s, v5.4s       // (x0 + x2) (x1 + x3)  (y0 + y2); (y1 + y3);
    sub       v1.4s, v4.4s, v5.4s       // (x0 - x2) (x1 - x3)  (y0 - y2); (y1 - y3);

    abs       v2.4s, v0.4s              //magnitudes
    abs       v3.4s, v1.4s

    cmgt      v4.4s, v0.4s, #0          //all ones where the value is positive
    cmgt      v5.4s, v1.4s, #0

    //the masks are all ones or all zeros, a plain truncating narrow keeps
    //them intact and does not saturate
    xtn       v4.4h, v4.4s
    xtn2      v4.8h, v5.4s

    mla       v25.4s, v2.4s, v30.4s     //abs * scale + round
    mla       v26.4s, v3.4s, v30.4s

    ushl      v2.4s, v25.4s, v24.4s     //>>qbit
    ushl      v3.4s, v26.4s, v24.4s     //>>qbit

    uqxtn     v2.4h, v2.4s              //saturating narrow of the magnitudes
    uqxtn2    v2.8h, v3.4s

    neg       v5.8h, v2.8h              //negated magnitudes

    //positive lanes keep the magnitude, the others take the negated one
    bsl       v4.16b, v2.16b, v5.16b

    //swap the two middle 32 bit lanes so that the 4 coefficients of each
    //plane end up contiguous
    mov       v5.s[0], v4.s[1]
    mov       v4.s[1], v4.s[2]
    mov       v4.s[2], v5.s[0]

    cmeq      v5.8h, v4.8h, #0          //all ones where the coefficient is zero
    xtn       v5.8b, v5.8h              //one byte per coefficient
    ushr      v5.8b, v5.8b, #7          //0xff -> 1
    movi      v20.8b, #4                //coefficients per plane
    addp      v5.8b, v5.8b, v5.8b       //two pairwise adds: byte 0 = zeros in the
    addp      v5.8b, v5.8b, v5.8b       //first 4 coefficients, byte 1 = last 4

    st1       {v4.8h}, [x1]             //store the block (stored only once)

    sub       v20.8b, v20.8b, v5.8b     //nnz = 4 - number of zeros, per plane

    st1       {v20.h}[0], [x6]          //store both nnz bytes

    pop_v_regs
    ret
686
687
688
689