1@/*****************************************************************************
2@*
3@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4@*
5@* Licensed under the Apache License, Version 2.0 (the "License");
6@* you may not use this file except in compliance with the License.
7@* You may obtain a copy of the License at:
8@*
9@* http://www.apache.org/licenses/LICENSE-2.0
10@*
11@* Unless required by applicable law or agreed to in writing, software
12@* distributed under the License is distributed on an "AS IS" BASIS,
13@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14@* See the License for the specific language governing permissions and
15@* limitations under the License.
16@*
17@*****************************************************************************/
18@/**
19@******************************************************************************
20@* @file
21@*  ihevc_inter_pred_luma_horz.s
22@*
23@* @brief
24@*  contains function definitions for inter prediction  interpolation.
25@* functions are coded using neon  intrinsics and can be compiled using
26@*
27@* rvct
28@*
29@* @author
30@*  parthiban v
31@*
32@* @par list of functions:
33@*
34@*  - ihevc_inter_pred_luma_horz()
35@*
36@* @remarks
37@*  none
38@*
39@*******************************************************************************
40@*/
41
42@/* all the functions here are replicated from ihevc_inter_pred_filters.c and modified to */
43@/* include reconstruction */
44@
45
46@/**
47@*******************************************************************************
48@*
49@* @brief
50@*     inter prediction luma filter for horizontal input
51@*
52@* @par description:
53@*    applies a horizontal filter with coefficients pointed to  by 'pi1_coeff' to
54@*    the elements pointed by 'pu1_src' and  writes to the location pointed by
55@*    'pu1_dst'  the output is downshifted by 6 and clipped to 8 bits
56@*    assumptions : the function is optimized considering the fact width is
57@*    multiple of 4 or 8. and height as multiple of 2.
58@*
59@* @param[in] pu1_src
60@*  uword8 pointer to the source
61@*
62@* @param[out] pu1_dst
63@*  uword8 pointer to the destination
64@*
65@* @param[in] src_strd
66@*  integer source stride
67@*
68@* @param[in] dst_strd
69@*  integer destination stride
70@*
71@* @param[in] pi1_coeff
72@*  word8 pointer to the filter coefficients
73@*
74@* @param[in] ht
75@*  integer height of the array
76@*
77@* @param[in] wd
78@*  integer width of the array
79@*
80@* @returns
81@*
82@* @remarks
83@*  none
84@*
85@*******************************************************************************
86@*/
87
88@void ihevc_inter_pred_luma_horz (
89@                            uword8 *pu1_src,
90@                            uword8 *pu1_dst,
91@                            word32 src_strd,
92@                            word32 dst_strd,
93@                            word8 *pi1_coeff,
94@                            word32 ht,
95@                            word32 wd   )
96
97@**************variables vs registers*****************************************
98@   r0 => *pu1_src
99@   r1 => *pu1_dst
100@   r2 =>  src_strd
101@   r3 =>  dst_strd
102@   r4 => *pi1_coeff
103@   r5 =>  ht
104@   r6 =>  wd
105
106.text
107.align 4
108
109
110
111
@ void ihevc_inter_pred_luma_horz_a9q(uword8 *pu1_src, uword8 *pu1_dst,
@                                     word32 src_strd, word32 dst_strd,
@                                     word8 *pi1_coeff, word32 ht, word32 wd)
@
@ 8-tap horizontal luma interpolation:
@   dst[x] = clip_u8((sum_k src[x + k - 3] * coeff[k] + 32) >> 6)
@ (the +32 rounding comes from the vqrshrun #6 rounding-narrow at the stores).
@ r0-r3 carry the first four arguments; pi1_coeff/ht/wd arrive on the stack
@ and are reloaded below at sp+40/44/48 (offsets valid after the 10-register
@ push at entry).
112.globl ihevc_inter_pred_luma_horz_a9q
113
114.type ihevc_inter_pred_luma_horz_a9q, %function
115
116ihevc_inter_pred_luma_horz_a9q:
117
118    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
119    @str        r1,[sp,#-4]
120    @ mov       r7,#8192
121start_loop_count:
122    @ ldr       r1,[sp,#-4]
123
124
125    ldr         r4,[sp,#40]                 @loads pi1_coeff
126    ldr         r8,[sp,#44]                 @loads ht
127    ldr         r10,[sp,#48]                @loads wd
128
129    vld1.8      {d0},[r4]                   @coeff = vld1_s8(pi1_coeff)
@ r11 = 1: every vector load below post-increments the source pointer by one
@ byte, so consecutive loads yield the byte-shifted windows that the
@ commented-out vext sequences used to build.
130    mov         r11,#1
131    subs        r14,r8,#0                   @checks for ht == 0
132
133    vabs.s8     d2,d0                       @vabs_s8(coeff)
134
135    @ble       end_loops
136
137
@ Broadcast |coeff[k]| into d24..d31.  The tap signs are hard-coded into the
@ vmlal (add) / vmlsl (subtract) mix at every use site as (-,+,-,+,+,-,+,-);
@ NOTE(review): this assumes pi1_coeff follows that sign pattern -- confirm
@ against the C reference ihevc_inter_pred_filters.c.
138    vdup.8      d24,d2[0]                   @coeffabs_0 = vdup_lane_u8(coeffabs, 0)
139    sub         r12,r0,#3                   @pu1_src - 3
140    vdup.8      d25,d2[1]                   @coeffabs_1 = vdup_lane_u8(coeffabs, 1)
141    add         r4,r12,r2                   @pu1_src_tmp2_8 = pu1_src + src_strd
142    vdup.8      d26,d2[2]                   @coeffabs_2 = vdup_lane_u8(coeffabs, 2)
143    rsb         r9,r10,r2,lsl #1            @2*src_strd - wd
144    vdup.8      d27,d2[3]                   @coeffabs_3 = vdup_lane_u8(coeffabs, 3)
145    rsb         r8,r10,r3,lsl #1            @2*dst_strd - wd
146    vdup.8      d28,d2[4]                   @coeffabs_4 = vdup_lane_u8(coeffabs, 4)
147
148    vdup.8      d29,d2[5]                   @coeffabs_5 = vdup_lane_u8(coeffabs, 5)
149    @ tst       r10,#7                          @checks wd for multiples
150    vdup.8      d30,d2[6]                   @coeffabs_6 = vdup_lane_u8(coeffabs, 6)
151    vdup.8      d31,d2[7]                   @coeffabs_7 = vdup_lane_u8(coeffabs, 7)
152
153    mov         r7,r1                       @save pu1_dst; residual paths rebase r1 from r7
154
@ Dispatch on wd:
@   <= 4        -> 4-wide path
@   24          -> run as 16-wide, then an 8-wide residual pass (epilog_16)
@   >= 16       -> 16-wide path
@   12          -> run as 8-wide, then a 4-wide residual pass
@   otherwise 8 -> 8-wide path
155    cmp         r10,#4
156    ble         outer_loop_4
157
158    cmp         r10,#24
159    moveq       r10,#16
160    addeq       r8,#8
161    addeq       r9,#8
162
163    cmp         r10,#16
164    bge         outer_loop_16
165
166    cmp         r10,#12
167    addeq       r8,#4
168    addeq       r9,#4
169    b           outer_loop_8
170
171
@ Entered from epilog_16 when wd was 24: redo the rightmost 8 columns
@ (src and dst rebased and advanced by 16) with the 8-wide loop.
@ NOTE(review): the row count r14 is hard-coded to 32 here -- this path
@ appears to assume ht == 32 whenever wd == 24; confirm against callers.
172outer_loop8_residual:
173    sub         r12,r0,#3                   @pu1_src - 3
174    mov         r1,r7
175    mov         r14,#32
176    add         r1,#16
177    add         r12,#16
178    mov         r10,#8
179    add         r8,#8
180    add         r9,#8
181
@ 8-wide path: each outer iteration filters TWO rows (row at r12, and row at
@ r4 = r12 + src_strd), producing 8 output pixels per row per inner iteration.
@ Loads for one row are interleaved with the multiply-accumulates of the
@ other to hide load latency -- do not reorder.
182outer_loop_8:
183
184    add         r6,r1,r3                    @pu1_dst + dst_strd
185    add         r4,r12,r2                   @pu1_src + src_strd
186    subs        r5,r10,#0                   @checks wd
187
188    ble         end_inner_loop_8
189
@ d0..d7 = eight byte-shifted 8-pixel windows of the current row (each load
@ advances r12 by r11 = 1 byte), i.e. src[x+0..7] for the 8 filter taps.
190inner_loop_8:
191    vld1.u32    {d0},[r12],r11              @vector load pu1_src
192    vld1.u32    {d1},[r12],r11
193    vld1.u32    {d2},[r12],r11
194    vld1.u32    {d3},[r12],r11
195
196
197
198
199
200    @ vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
201    @ vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
202    @ vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]
203    @ vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
204    @ vext.u8   d6,d0,d1,#6                     @vector extract of src [0_6]
205    @ vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
206    @ vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
207    @ vext.u8   d14,d12,d13,#2
208
209    @vext.u8    d15,d12,d13,#3                  @vector extract of src[0_3]
210    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
211    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
212    @vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
213    @vext.u8    d19,d12,d13,#7                  @vector extract of src[0_7]
214    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]
@ q4 accumulates row 0; tap signs folded into vmlal/vmlsl (see note at setup).
215    vld1.u32    {d4},[r12],r11
216    vmull.u8    q4,d1,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
217    vld1.u32    {d5},[r12],r11
218    vmlal.u8    q4,d3,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
219    vld1.u32    {d6},[r12],r11
220    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
221    vld1.u32    {d7},[r12],r11
222    vmlsl.u8    q4,d2,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
223    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
224    vmlal.u8    q4,d4,d28                   @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
225    vld1.u32    {d13},[r4],r11
226    vmlsl.u8    q4,d5,d29                   @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
227    vld1.u32    {d14},[r4],r11
228    vmlal.u8    q4,d6,d30                   @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
229    vld1.u32    {d15},[r4],r11
230    vmlsl.u8    q4,d7,d31                   @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
231    vld1.u32    {d16},[r4],r11              @vector load pu1_src + src_strd
232
@ q5 accumulates row 1 (windows d12..d19) while row 0 narrows and stores.
233    vmull.u8    q5,d15,d27                  @mul_res = vmull_u8(src[0_3], coeffabs_3)@
234    vld1.u32    {d17},[r4],r11
235    vmlsl.u8    q5,d14,d26                  @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
236    vld1.u32    {d18},[r4],r11
237    vmlal.u8    q5,d16,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
238    vld1.u32    {d19},[r4],r11              @vector load pu1_src + src_strd
239    vmlsl.u8    q5,d17,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
240    vqrshrun.s16 d20,q4,#6                  @right shift and saturating narrow result 1
241    vmlal.u8    q5,d18,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
242    vmlsl.u8    q5,d19,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
243    vst1.8      {d20},[r1]!                 @store the result pu1_dst
244    vmlsl.u8    q5,d12,d24                  @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
245    vmlal.u8    q5,d13,d25                  @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
246
247
248
249    vqrshrun.s16 d8,q5,#6                   @right shift and saturating narrow result 2
250    subs        r5,r5,#8                    @decrement the wd loop
251    vst1.8      {d8},[r6]!                  @store the result pu1_dst
@ loop while more than 4 columns remain: for wd == 12 this leaves the last
@ 4 columns to the 4-wide residual pass below.
252    cmp         r5,#4
253    bgt         inner_loop_8
254
@ two rows consumed per outer iteration; r9/r8 step both pointers to the
@ start of the next row pair.
255end_inner_loop_8:
256    subs        r14,r14,#2                  @decrement the ht loop
257    add         r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
258    add         r1,r1,r8                    @increment the dst pointer by 2*dst_strd-wd
259    bgt         outer_loop_8
260
261
262
263
264
@ wd == 12: the 8-wide loop covered the first 8 columns; finish the last 4.
265    ldr         r10,[sp,#48]                @loads wd
266    cmp         r10,#12
267
268    beq         outer_loop4_residual
269
270
271end_loops:
272
273    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp (pops saved lr into pc: return)
274
275
276
277
278
279
@ 16-wide path: two rows per iteration, 16 output pixels per row at a time.
@ Software-pipelined: the first row's filter (q4) is primed before entering
@ inner_loop_16 and re-primed at the loop tail; loads for the next block are
@ interleaved with the multiplies of the current one.  Statement order is
@ load-bearing -- do not reorder.
280outer_loop_16:
@ Spill r0/r7 below the saved registers; restored in epilog_16 (the two pops
@ there bring sp back before wd is reloaded from sp+48).
281    str         r0, [sp, #-4]!
282    str         r7, [sp, #-4]!
283
284    add         r6,r1,r3                    @pu1_dst + dst_strd
285    add         r4,r12,r2                   @pu1_src + src_strd
286    and         r0, r12, #31
287    sub         r5,r10,#0                   @checks wd
288    @ble       end_loops1
@ Prime q4 with the low-half filter of row 0; q0..q3,q6..q9 hold the eight
@ byte-shifted 16-pixel windows (loads advance by r11 = 1).
289    pld         [r12, r2, lsl #1]
290    vld1.u32    {q0},[r12],r11              @vector load pu1_src
291    pld         [r4, r2, lsl #1]
292    vld1.u32    {q1},[r12],r11
293    vld1.u32    {q2},[r12],r11
294    vld1.u32    {q3},[r12],r11
295    vld1.u32    {q6},[r12],r11
296    vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
297    vld1.u32    {q7},[r12],r11
298    vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
299    vld1.u32    {q8},[r12],r11
300    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
301    vld1.u32    {q9},[r12],r11
302    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
303    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
304    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
305    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
306    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
307
308
@ Per iteration: q10 = high half of row 0, q5/q11 = low/high halves of row 1;
@ row 1's windows are loaded from r4 while row 0's high half is computed.
@ The "eq" conditional instructions fire when r5 hits 0, i.e. at the end of
@ a 16-column strip: they advance to the next row pair.
309inner_loop_16:
310
311
312    subs        r5,r5,#16
313    vmull.u8    q10,d3,d25
314
315    add         r12,#8
316    vmlsl.u8    q10,d1,d24
317
318    subeq       r14,r14,#2
319    vmlal.u8    q10,d7,d27
320
321    vld1.u32    {q0},[r4],r11               @vector load pu1_src
322    vmlsl.u8    q10,d5,d26
323
324    vld1.u32    {q1},[r4],r11
325    vmlal.u8    q10,d13,d28
326
327    vld1.u32    {q2},[r4],r11
328    vmlal.u8    q10,d17,d30
329
330    vld1.u32    {q3},[r4],r11
331    vmlsl.u8    q10,d15,d29
332
333    vld1.u32    {q6},[r4],r11
334    vmlsl.u8    q10,d19,d31
335
336    vld1.u32    {q7},[r4],r11
337    vqrshrun.s16 d8,q4,#6                   @right shift and saturating narrow result 1
338
339    vld1.u32    {q8},[r4],r11
340    vmull.u8    q5,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
341
342    vld1.u32    {q9},[r4],r11
343    vmlal.u8    q5,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
344
345    add         r4,#8
346    vmlsl.u8    q5,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
347
348    addeq       r12,r12,r9                  @increment the src pointer by 2*src_strd-wd
349    vmlsl.u8    q5,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
350
351    addeq       r4,r12,r2                   @pu1_src + src_strd
352    vqrshrun.s16 d9,q10,#6
353
354    vmlal.u8    q5,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
355
356@   and         r7, r12, #31
357    vmlsl.u8    q5,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
358
359    vmlal.u8    q5,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
360
361    vmlsl.u8    q5,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
362
363    vmull.u8    q11,d3,d25
364
365    vmlsl.u8    q11,d1,d24
366
367    vst1.8      {q4},[r1]!                  @store the result pu1_dst
368    vmlal.u8    q11,d7,d27
369
370    addeq       r1,r1,r8
371    vqrshrun.s16 d10,q5,#6                  @right shift and saturating narrow result 2
372
373@   cmp         r7, r0
374    vmlsl.u8    q11,d5,d26
375
376    pld         [r12, r2, lsl #2]
377    vmlal.u8    q11,d13,d28
378
379    pld         [r4, r2, lsl #2]
380    vmlal.u8    q11,d17,d30
381
382@   mov         r0, r7
383    vmlsl.u8    q11,d15,d29
384
@ all rows done? fall into the epilog to flush the in-flight row-1 result.
385    cmp         r14,#0
386    vmlsl.u8    q11,d19,d31
387
388    beq         epilog_16
@ Re-prime q4 for the next iteration while storing row 1 of the current one.
389    vld1.u32    {q0},[r12],r11              @vector load pu1_src
390    vld1.u32    {q1},[r12],r11
391    vld1.u32    {q2},[r12],r11
392    vld1.u32    {q3},[r12],r11
393    vld1.u32    {q6},[r12],r11
394    vqrshrun.s16 d11,q11,#6
395    vmull.u8    q4,d2,d25                   @mul_res = vmlal_u8(src[0_1], coeffabs_1)@
396    vld1.u32    {q7},[r12],r11
397    vmlal.u8    q4,d6,d27                   @mul_res = vmull_u8(src[0_3], coeffabs_3)@
398    vld1.u32    {q8},[r12],r11
399    vmlsl.u8    q4,d0,d24                   @mul_res = vmlsl_u8(src[0_0], coeffabs_0)@
400    vld1.u32    {q9},[r12],r11
401    vmlsl.u8    q4,d4,d26                   @mul_res = vmlsl_u8(src[0_2], coeffabs_2)@
402    vmlal.u8    q4,d12,d28                  @mul_res = vmlal_u8(src[0_4], coeffabs_4)@
403    cmp         r5,#0
404    vmlsl.u8    q4,d14,d29                  @mul_res = vmlsl_u8(src[0_5], coeffabs_5)@
405    moveq       r5,r10                      @strip finished: reset the column counter
406    vmlal.u8    q4,d16,d30                  @mul_res = vmlal_u8(src[0_6], coeffabs_6)@
407    vst1.8      {q5},[r6]!                  @store the result pu1_dst
408    vmlsl.u8    q4,d18,d31                  @mul_res = vmlsl_u8(src[0_7], coeffabs_7)@
409    addeq       r6,r1,r3                    @pu1_dst + dst_strd
410    b           inner_loop_16
411
412
@ Flush the final pipelined row-1 result, restore the r7/r0 spills, and run
@ the 8-wide residual pass if the original wd was 24.
413epilog_16:
414    vqrshrun.s16 d11,q11,#6
415    vst1.8      {q5},[r6]!                  @store the result pu1_dst
416
417    ldr         r7, [sp], #4
418    ldr         r0, [sp], #4
419    ldr         r10,[sp,#48]
420    cmp         r10,#24
421
422    beq         outer_loop8_residual
423
424
425
426end_loops1:
427
428    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp (pops saved lr into pc: return)
429
430
431
432
433
434
435
436
@ Entered when wd was 12 after the 8-wide loop: filter the last 4 columns
@ (src/dst rebased and advanced by 8).
@ NOTE(review): the row count r14 is hard-coded to 16 here -- this path
@ appears to assume ht == 16 whenever wd == 12; confirm against callers.
437outer_loop4_residual:
438    sub         r12,r0,#3                   @pu1_src - 3
439    mov         r1,r7
440    add         r1,#8
441    mov         r10,#4
442    add         r12,#8
443    mov         r14,#16
444    add         r8,#4
445    add         r9,#4

@ 4-wide path: two rows per iteration.  The two rows' 4-pixel windows are
@ zipped into single d-registers so one filter pass produces both rows;
@ the result's 32-bit halves are stored to the two destination rows.
447outer_loop_4:
448    add         r6,r1,r3                    @pu1_dst + dst_strd
449    add         r4,r12,r2                   @pu1_src + src_strd

451    subs        r5,r10,#0                   @checks wd
452    ble         end_inner_loop_4

454inner_loop_4:
455    vld1.u32    {d0},[r12],r11              @vector load pu1_src
456    vld1.u32    {d1},[r12],r11
457    vld1.u32    {d2},[r12],r11
458    vld1.u32    {d3},[r12],r11
459    vld1.u32    {d4},[r12],r11
460    vld1.u32    {d5},[r12],r11
461    vld1.u32    {d6},[r12],r11
462    vld1.u32    {d7},[r12],r11
463    @add       r12,r12,#4                      @increment the input pointer
464    sub         r12,r12,#4                  @net advance of 4: eight 1-byte post-increments minus 4
465    @vext.u8   d2,d0,d1,#2                     @vector extract of src[0_2]
466    @vext.u8   d3,d0,d1,#3                     @vector extract of src[0_3]
467    @vext.u8   d4,d0,d1,#4                     @vector extract of src[0_4]

469    @vext.u8   d5,d0,d1,#5                     @vector extract of src[0_5]
470    @vext.u8   d6,d0,d1,#6                     @vector extract of src[0_6]
471    @vext.u8   d7,d0,d1,#7                     @vector extract of src[0_7]
472    @vext.u8   d1,d0,d1,#1                     @vector extract of src[0_1]
473    vld1.u32    {d12},[r4],r11              @vector load pu1_src + src_strd
474    vld1.u32    {d13},[r4],r11
475    vzip.32     d0,d12                      @vector zip the i iteration and ii iteration in single register
476    vld1.u32    {d14},[r4],r11
477    vzip.32     d1,d13
478    vld1.u32    {d15},[r4],r11
479    vzip.32     d2,d14
480    vld1.u32    {d16},[r4],r11
481    vzip.32     d3,d15
482    vld1.u32    {d17},[r4],r11
483    vzip.32     d4,d16
484    vld1.u32    {d18},[r4],r11
485    vzip.32     d5,d17
486    vld1.u32    {d19},[r4],r11
487    sub         r4,r4,#4                    @net advance of 4 on the second row as well
488    @ add       r4,r4,#4                        @increment the input pointer
489    @ vext.u8   d14,d12,d13,#2                  @vector extract of src[0_2]
490    @ vext.u8   d15,d12,d13,#3                  @vector extract of src[0_3]
491    @ vext.u8   d16,d12,d13,#4                  @vector extract of src[0_4]
492    @ vext.u8   d17,d12,d13,#5                  @vector extract of src[0_5]
493    @ vext.u8   d18,d12,d13,#6                  @vector extract of src[0_6]
494    @ vext.u8   d19,d12,d13,#7                  @vector extract of src[0_7]
495    @vext.u8   d13,d12,d13,#1                  @vector extract of src[0_1]







503    vzip.32     d6,d18
504    vzip.32     d7,d19

@ One 8-tap pass filters both zipped rows; tap signs folded into vmlal/vmlsl.
506    vmull.u8    q4,d1,d25                   @arithmetic operations for ii iteration in the same time
507    vmlsl.u8    q4,d0,d24
508    vmlsl.u8    q4,d2,d26
509    vmlal.u8    q4,d3,d27
510    vmlal.u8    q4,d4,d28
511    vmlsl.u8    q4,d5,d29
512    vmlal.u8    q4,d6,d30
513    vmlsl.u8    q4,d7,d31

515    vqrshrun.s16 d8,q4,#6                   @narrow right shift and saturating the result
516    vst1.32     {d8[0]},[r1]!               @store the i iteration result which is in upper part of the register
517    vst1.32     {d8[1]},[r6]!               @store the ii iteration result which is in lower part of the register
518    subs        r5,r5,#4                    @decrement the wd by 4
519    bgt         inner_loop_4

521end_inner_loop_4:
522    subs        r14,r14,#2                  @decrement the ht by 2 (two rows per iteration)
523    add         r12,r12,r9                  @increment the input pointer 2*src_strd-wd
524    add         r1,r1,r8                    @increment the output pointer 2*dst_strd-wd
525    bgt         outer_loop_4
526    @subs   r7,r7,#1
527    @ bgt   start_loop_count

529    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp (pops saved lr into pc: return)
530
531
532
533
534
535
536
537