// ih264_padding_neon_av8.s revision a2b49e5f0574dee76f81507f288143d83a4b7c1a
1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21// *******************************************************************************
22// * @file
// *  ih264_padding_neon_av8.s
24// *
25// * @brief
// *  Contains function definitions for padding
27// *
28// * @author
29// *     Ittiam
30// *
31// * @par List of Functions:
32// *  - ih264_pad_top_av8()
33// *  - ih264_pad_left_luma_av8()
34// *  - ih264_pad_left_chroma_av8()
35// *  - ih264_pad_right_luma_av8()
36// *  - ih264_pad_right_chroma_av8()
37// *
38// * @remarks
39// *  None
40// *
41// *******************************************************************************
42//*/
43
44.text
45.p2align 2
46.include "ih264_neon_macros.s"
///**
//*******************************************************************************
//*
//* @brief
//*  Pads the top of a 2d array
//*
//* @par Description:
//*  The row that starts at pu1_src is replicated into the pad_size rows
//*  directly above it. The row is processed in strips of 16 bytes.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the first byte of the row to replicate
//*
//* @param[in] src_strd
//*  WORD32 stride between consecutive rows, in bytes
//*
//* @param[in] wd
//*  WORD32 width of the row in bytes. The row is handled 16 bytes at a time,
//*  so a width that is not a multiple of 16 is rounded up to the next one.
//*
//* @param[in] pad_size
//*  WORD32 number of rows written above the source row
//*
//* @returns none
//*
//* @remarks
//*  Returns without touching memory when wd <= 0 or pad_size <= 0.
//*
//*******************************************************************************
//*/
//void ih264_pad_top_av8(UWORD8 *pu1_src,
//                       WORD32 src_strd,
//                       WORD32 wd,
//                       WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src  (advances 16 bytes per strip)
//    x1 => src_strd
//    x2 => wd        (bytes still to process)
//    x3 => pad_size
//    x4 => store pointer inside the current strip
//    x5 => first destination of the current strip (row above the source)
//    x6 => -src_strd (post-index step that walks upwards)
//    x7 => rows still to write in the current strip

    .global ih264_pad_top_av8

ih264_pad_top_av8:

    push_v_regs                         // macro defined outside this function (presumably in ih264_neon_macros.s)
    stp       x19, x20, [sp, #-16]!

    // The WORD32 arguments arrive in w1-w3. The procedure call standard leaves
    // bits 63:32 of x1-x3 unspecified, so widen them before any 64-bit use.
    sxtw      x1, w1
    sxtw      x2, w2
    sxtw      x3, w3

    cmp       x2, #0
    b.le      end_func_top              // no columns to pad
    cmp       x3, #0
    b.le      end_func_top              // no rows to pad

    sub       x5, x0, x1                // first destination: one row above the source row
    neg       x6, x1                    // each store steps one row further up

loop_neon_memcpy_mul_16:
    // Load 16 bytes of the source row
    ld1       {v0.8b, v1.8b}, [x0], #16
    mov       x4, x5                    // restart at the row just above the source
    mov       x7, x3                    // pad_size rows for this strip
    add       x5, x5, #16               // next strip starts 16 bytes to the right

loop_neon_pad_top:
    st1       {v0.8b, v1.8b}, [x4], x6
    subs      x7, x7, #1
    b.gt      loop_neon_pad_top

    subs      x2, x2, #16
    b.gt      loop_neon_memcpy_mul_16

end_func_top:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
114
115
116
117
///**
//*******************************************************************************
//*
//* @brief
//*  Padding (luma block) at the left of a 2d array
//*
//* @par Description:
//*  For every row, the byte at pu1_src is replicated into the bytes starting
//*  pad_size bytes to its left. The routine is hard coded for two widths:
//*  16 bytes are written per row when pad_size is 16, and 32 bytes are written
//*  per row for any other pad_size.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the left-most byte of the first row
//*
//* @param[in] src_strd
//*  WORD32 stride between consecutive rows, in bytes
//*
//* @param[in] ht
//*  WORD32 number of rows. Rows are processed 8 at a time, so a height that
//*  is not a multiple of 8 is rounded up to the next one.
//*
//* @param[in] pad_size
//*  WORD32 padding width in bytes (16 or 32)
//*
//* @returns none
//*
//* @remarks
//*  Returns without touching memory when ht <= 0.
//*  The source bytes of four rows are read before those rows are written, so
//*  rows are expected not to overlap in memory.
//*
//*******************************************************************************
//*/
//void ih264_pad_left_luma_av8(UWORD8 *pu1_src,
//                             WORD32 src_strd,
//                             WORD32 ht,
//                             WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src  (byte to replicate, advances one row per load)
//    x1 => src_strd
//    x2 => ht        (rows still to process)
//    x3 => pad_size
//    x4 => store pointer (start of the pad area of the current row)
//    x6 => src_strd - 16 (step from the second 16-byte half to the next row)

    .global ih264_pad_left_luma_av8

ih264_pad_left_luma_av8:

    push_v_regs                         // macro defined outside this function (presumably in ih264_neon_macros.s)
    stp       x19, x20, [sp, #-16]!

    // The WORD32 arguments arrive in w1-w3. The procedure call standard leaves
    // bits 63:32 of x1-x3 unspecified, so widen them before any 64-bit use.
    sxtw      x1, w1
    sxtw      x2, w2
    sxtw      x3, w3

    cmp       x2, #0
    b.le      end_func                  // no rows to pad

    sub       x4, x0, x3                // pad area starts pad_size bytes left of the source byte
    sub       x6, x1, #16
    cmp       x3, #16
    b.ne      loop_32

loop_16:                                //  /*hard coded for width=16  ,height =8,16*/
    // rows 0-3: load-and-replicate one byte per row, then store 16 bytes per row
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v4.16b}, [x4], x1
    st1       {v6.16b}, [x4], x1
    // rows 4-7
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v4.16b}, [x4], x1
    st1       {v6.16b}, [x4], x1
    subs      x2, x2, #8
    b.gt      loop_16
    b         end_func

loop_32:                                //  /*hard coded for width=32 ,height =8,16*/
    // rows 0-3: each row is written as two 16-byte halves; the second store
    // steps by x6 so that x4 lands on the pad area of the next row
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    // rows 4-7
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    subs      x2, x2, #8
    b.gt      loop_32

end_func:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
286
287
288
289
290
///**
//*******************************************************************************
//*
//* @brief
//*  Padding (chroma block) at the left of a 2d array
//*
//* @par Description:
//*  For every row, the 2-byte unit at pu1_src (presumably one interleaved
//*  chroma sample pair) is replicated 16 times, i.e. 32 bytes are written per
//*  row, starting pad_size bytes to the left of pu1_src.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the left-most 2-byte unit of the first row
//*
//* @param[in] src_strd
//*  WORD32 stride between consecutive rows, in bytes
//*
//* @param[in] ht
//*  WORD32 number of rows. Rows are processed 4 at a time, so a height that
//*  is not a multiple of 4 is rounded up to the next one.
//*
//* @param[in] pad_size
//*  WORD32 padding width in bytes; only used to locate the start of the pad
//*  area, the amount written per row is fixed at 32 bytes
//*
//* @returns none
//*
//* @remarks
//*  Returns without touching memory when ht <= 0.
//*  The source units of four rows are read before those rows are written, so
//*  rows are expected not to overlap in memory.
//*
//*******************************************************************************
//*/
//void ih264_pad_left_chroma_av8(UWORD8 *pu1_src,
//                               WORD32 src_strd,
//                               WORD32 ht,
//                               WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src  (unit to replicate, advances one row per load)
//    x1 => src_strd
//    x2 => ht        (rows still to process)
//    x3 => pad_size
//    x4 => store pointer (start of the pad area of the current row)
//    x6 => src_strd - 16 (step from the second 16-byte half to the next row)

    .global ih264_pad_left_chroma_av8

ih264_pad_left_chroma_av8:

    push_v_regs                         // macro defined outside this function (presumably in ih264_neon_macros.s)
    stp       x19, x20, [sp, #-16]!

    // The WORD32 arguments arrive in w1-w3. The procedure call standard leaves
    // bits 63:32 of x1-x3 unspecified, so widen them before any 64-bit use.
    sxtw      x1, w1
    sxtw      x2, w2
    sxtw      x3, w3

    cmp       x2, #0
    b.le      end_func_l_c              // no rows to pad

    sub       x4, x0, x3                // pad area starts pad_size bytes left of the source unit
    sub       x6, x1, #16

loop_32_l_c:                            //  /*hard coded for width=32  ,height =4,8,12*/
    // load-and-replicate one halfword per row for four rows
    ld1r      {v0.8h}, [x0], x1
    ld1r      {v2.8h}, [x0], x1
    ld1r      {v4.8h}, [x0], x1
    ld1r      {v6.8h}, [x0], x1
    // each row is written as two 16-byte halves; the second store steps by
    // x6 so that x4 lands on the pad area of the next row
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    subs      x2, x2, #4
    b.gt      loop_32_l_c

end_func_l_c:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
443
444
445
446
447
///**
//*******************************************************************************
//*
//* @brief
//*  Padding (luma block) at the right of a 2d array
//*
//* @par Description:
//*  For every row, the byte just before pu1_src is replicated into the bytes
//*  starting at pu1_src. The routine is hard coded for two widths: 16 bytes
//*  are written per row when pad_size is 16, and 32 bytes are written per row
//*  for any other pad_size.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the first pad byte of the first row (one past the last
//*  byte of the row being extended)
//*
//* @param[in] src_strd
//*  WORD32 stride between consecutive rows, in bytes
//*
//* @param[in] ht
//*  WORD32 number of rows. Rows are processed 8 at a time, so a height that
//*  is not a multiple of 8 is rounded up to the next one.
//*
//* @param[in] pad_size
//*  WORD32 padding width in bytes (16 or 32)
//*
//* @returns none
//*
//* @remarks
//*  Returns without touching memory when ht <= 0.
//*  The source bytes of four rows are read before those rows are written, so
//*  rows are expected not to overlap in memory.
//*
//*******************************************************************************
//*/
//void ih264_pad_right_luma_av8(UWORD8 *pu1_src,
//                              WORD32 src_strd,
//                              WORD32 ht,
//                              WORD32 pad_size)
//{
//    WORD32 row;
//
//    for(row = 0; row < ht; row++)
//    {
//        memset(pu1_src, *(pu1_src -1), pad_size);
//
//        pu1_src += src_strd;
//    }
//}
//
//**************Variables Vs Registers*************************
//    x0 => *pu1_src - 1 (byte to replicate, advances one row per load)
//    x1 => src_strd
//    x2 => ht        (rows still to process)
//    x3 => pad_size
//    x4 => store pointer (start of the pad area of the current row)
//    x6 => src_strd - 16 (step from the second 16-byte half to the next row)

    .global ih264_pad_right_luma_av8

ih264_pad_right_luma_av8:

    push_v_regs                         // macro defined outside this function (presumably in ih264_neon_macros.s)
    stp       x19, x20, [sp, #-16]!

    // The WORD32 arguments arrive in w1-w3. The procedure call standard leaves
    // bits 63:32 of x1-x3 unspecified, so widen them before any 64-bit use.
    sxtw      x1, w1
    sxtw      x2, w2
    sxtw      x3, w3

    cmp       x2, #0
    b.le      end_func_r                // no rows to pad

    mov       x4, x0                    // pad area starts at pu1_src
    sub       x6, x1, #16
    sub       x0, x0, #1                // source is the byte just left of the pad area
    cmp       x3, #16
    b.ne      loop_32_r                 // stay inside this function for the 32-byte case

loop_16_r:                              //  /*hard coded for width=16  ,height =8,16*/
    // rows 0-3: load-and-replicate one byte per row, then store 16 bytes per row
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v4.16b}, [x4], x1
    st1       {v6.16b}, [x4], x1
    // rows 4-7
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v4.16b}, [x4], x1
    st1       {v6.16b}, [x4], x1
    subs      x2, x2, #8
    b.gt      loop_16_r
    b         end_func_r

loop_32_r:                              //  /*hard coded for width=32  ,height =8,16*/
    // rows 0-3: each row is written as two 16-byte halves; the second store
    // steps by x6 so that x4 lands on the pad area of the next row
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    // rows 4-7
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    subs      x2, x2, #8
    b.gt      loop_32_r

end_func_r:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
626
627
628
629
630
///**
//*******************************************************************************
//*
//* @brief
//*  Padding (chroma block) at the right of a 2d array
//*
//* @par Description:
//*  For every row, the 2-byte unit just before pu1_src (presumably one
//*  interleaved chroma sample pair) is replicated 16 times, i.e. 32 bytes are
//*  written per row, starting at pu1_src.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the first pad byte of the first row (one past the last
//*  2-byte unit of the row being extended)
//*
//* @param[in] src_strd
//*  WORD32 stride between consecutive rows, in bytes
//*
//* @param[in] ht
//*  WORD32 number of rows. Rows are processed 4 at a time, so a height that
//*  is not a multiple of 4 is rounded up to the next one.
//*
//* @param[in] pad_size
//*  WORD32 padding width in bytes; not read by this routine, the amount
//*  written per row is fixed at 32 bytes
//*
//* @returns none
//*
//* @remarks
//*  Returns without touching memory when ht <= 0.
//*  The source units of four rows are read before those rows are written, so
//*  rows are expected not to overlap in memory.
//*
//*******************************************************************************
//*/
//void ih264_pad_right_chroma_av8(UWORD8 *pu1_src,
//                                WORD32 src_strd,
//                                WORD32 ht,
//                                WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src - 2 (unit to replicate, advances one row per load)
//    x1 => src_strd
//    x2 => ht        (rows still to process)
//    x3 => pad_size  (unused)
//    x4 => store pointer (start of the pad area of the current row)
//    x6 => src_strd - 16 (step from the second 16-byte half to the next row)

    .global ih264_pad_right_chroma_av8

ih264_pad_right_chroma_av8:

    push_v_regs                         // macro defined outside this function (presumably in ih264_neon_macros.s)
    stp       x19, x20, [sp, #-16]!

    // The WORD32 arguments arrive in w1-w2. The procedure call standard leaves
    // bits 63:32 of x1-x2 unspecified, so widen them before any 64-bit use.
    sxtw      x1, w1
    sxtw      x2, w2

    cmp       x2, #0
    b.le      end_func_r_c              // no rows to pad

    mov       x4, x0                    // pad area starts at pu1_src
    sub       x6, x1, #16
    sub       x0, x0, #2                // source is the 2-byte unit just left of the pad area

loop_32_r_c:                            //  /*hard coded for width=32 ,height =8,4*/
    // load-and-replicate one halfword per row for four rows
    ld1r      {v0.8h}, [x0], x1
    ld1r      {v2.8h}, [x0], x1
    ld1r      {v4.8h}, [x0], x1
    ld1r      {v6.8h}, [x0], x1
    // each row is written as two 16-byte halves; the second store steps by
    // x6 so that x4 lands on the pad area of the next row
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    subs      x2, x2, #4
    b.gt      loop_32_r_c

end_func_r_c:
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret
779
780
781
782
783
784
785