@ rotate_neon.s revision 7cd8149e2cbad8b1ff6d481c37a4775d3c8cf2fa
  .global RestoreRegisters_NEON
  .global ReverseLine_NEON
  .global ReverseLineUV_NEON
  .global SaveRegisters_NEON
  .global TransposeWx8_NEON
  .global TransposeUVWx8_NEON
  .type RestoreRegisters_NEON, function
  .type ReverseLine_NEON, function
  .type ReverseLineUV_NEON, function
  .type SaveRegisters_NEON, function
  .type TransposeWx8_NEON, function
  .type TransposeUVWx8_NEON, function

@ void ReverseLine_NEON (const uint8* src, uint8* dst, int width)
@ r0 const uint8* src
@ r1 uint8* dst
@ r2 width
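@
@ roughly what this routine computes, as a scalar C sketch (the C
@ version is illustrative only and is not part of this file; uint8
@ is assumed to be libyuv's typedef for unsigned char):
@   void ReverseLine_C(const uint8* src, uint8* dst, int width) {
@     for (int i = 0; i < width; ++i)
@       dst[width - 1 - i] = src[i];
@   }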
ReverseLine_NEON:

  @ compute where to start writing destination
  add         r1, r2      @ dst + width

  @ work on segments that are multiples of 16
  lsrs        r3, r2, #4

  @ the output is written in two blocks: 8 bytes followed by
  @ another 8.  reading is done sequentially, from left to
  @ right.  writing is done from right to left in blocks.
  @ r1, the destination pointer, is incremented after writing
  @ the first of the two blocks, so subtract that 8 off along
  @ with 16 to get the next location.
  @ (mov does not touch the flags, so beq below still tests the
  @ result of the lsrs above)
  mov         r3, #-24

  beq         Lline_residuals

  @ back the destination off by the size of the register that
  @ is going to be reversed
  sub         r1, #16

  @ the loop needs to run on blocks of 16.  what will be left
  @ over is either a negative number, the residuals that need
  @ to be done, or 0.  if this isn't subtracted off here the
  @ loop will run one extra time.
  sub         r2, #16

Lsegments_of_16:
    vld1.8      {q0}, [r0]!               @ src += 16

    @ reverse the bytes in each 64 bit segment.  vrev64 is the
    @ widest byte reverse available; there is no single
    @ instruction that reverses all 128 bits in one go.
    vrev64.8    q0, q0

    @ because the two halves are only reversed within themselves,
    @ write the two 64 bit segments out in swapped order.
    vst1.8      {d1}, [r1]!
    vst1.8      {d0}, [r1], r3            @ dst -= 16
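
    @ worked example (assumed layout): if the 16 loaded bytes were
    @ 0..15, vrev64.8 leaves d0 = 7..0 and d1 = 15..8.  storing d1
    @ first and then d0 therefore writes 15..0, the fully reversed
    @ block, at the destination.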

    subs        r2, #16
    bge         Lsegments_of_16

  @ add 16 back to the counter.  if the result is 0 there are no
  @ residuals, so return
  adds        r2, #16
  bxeq        lr

  add         r1, #16

Lline_residuals:

  mov         r3, #-3

  sub         r1, #2
  subs        r2, #2
  @ check for 16*n+1 scenarios where segments_of_2 should not
  @ be run, but there is something left over.
  blt         Lsegment_of_1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_2:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2

    vst1.8      {d1[0]}, [r1]!
    vst1.8      {d0[0]}, [r1], r3         @ dst -= 2

    subs        r2, #2
    bge         Lsegments_of_2

  adds        r2, #2
  bxeq        lr

Lsegment_of_1:
  add         r1, #1
  vld1.8      {d0[0]}, [r0]
  vst1.8      {d0[0]}, [r1]

  bx          lr

@ void TransposeWx8_NEON (const uint8* src, int src_stride,
@                         uint8* dst, int dst_stride,
@                         int w)
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst
@ r3 int dst_stride
@ stack int w
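@
@ roughly what this routine computes, as a scalar C sketch
@ (illustrative only, not part of this file):
@   void TransposeWx8_C(const uint8* src, int src_stride,
@                       uint8* dst, int dst_stride, int w) {
@     for (int i = 0; i < w; ++i)
@       for (int j = 0; j < 8; ++j)
@         dst[i * dst_stride + j] = src[j * src_stride + i];
@   }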
TransposeWx8_NEON:
  push        {r4,r8,r9,lr}

  ldr         r8, [sp, #16]        @ width

  @ loops are on blocks of 8.  the loop will stop when the
  @ counter gets to or below 0.  starting the counter
  @ at w-8 allows for this
  sub         r8, #8

@ handle 8x8 blocks.  this should be the majority of the plane
Lloop_8x8:
    mov         r9, r0

    vld1.8      {d0}, [r9], r1
    vld1.8      {d1}, [r9], r1
    vld1.8      {d2}, [r9], r1
    vld1.8      {d3}, [r9], r1
    vld1.8      {d4}, [r9], r1
    vld1.8      {d5}, [r9], r1
    vld1.8      {d6}, [r9], r1
    vld1.8      {d7}, [r9]

    vtrn.8      d1, d0
    vtrn.8      d3, d2
    vtrn.8      d5, d4
    vtrn.8      d7, d6

    vtrn.16     d1, d3
    vtrn.16     d0, d2
    vtrn.16     d5, d7
    vtrn.16     d4, d6

    vtrn.32     d1, d5
    vtrn.32     d0, d4
    vtrn.32     d3, d7
    vtrn.32     d2, d6

    vrev16.8    q0, q0
    vrev16.8    q1, q1
    vrev16.8    q2, q2
    vrev16.8    q3, q3
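
    @ the three vtrn passes transpose the 8x8 byte block held one
    @ row per d register: vtrn.8 swaps 1x1 elements across row
    @ pairs, vtrn.16 swaps 2x2 sub-blocks and vtrn.32 swaps 4x4
    @ sub-blocks.  the operand order differs from the textbook
    @ form so that rows come out in the d1,d0,d3,d2,... sequence
    @ stored below; the vrev16.8 pass appears to compensate by
    @ reversing the two bytes inside each 16 bit lane.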

    mov         r9, r2

    vst1.8      {d1}, [r9], r3
    vst1.8      {d0}, [r9], r3
    vst1.8      {d3}, [r9], r3
    vst1.8      {d2}, [r9], r3
    vst1.8      {d5}, [r9], r3
    vst1.8      {d4}, [r9], r3
    vst1.8      {d7}, [r9], r3
    vst1.8      {d6}, [r9]

    add         r0, #8            @ src += 8
    add         r2, r3, lsl #3    @ dst += 8 * dst_stride
    subs        r8,  #8           @ w   -= 8
    bge         Lloop_8x8

  @ add 8 back to the counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
  beq         Ldone

  @ some residual, so between 1 and 7 columns left to transpose
  cmp         r8, #2
  blt         Lblock_1x8

  cmp         r8, #4
  blt         Lblock_2x8

Lblock_4x8:
  mov         r9, r0
  vld1.32     {d0[0]}, [r9], r1
  vld1.32     {d0[1]}, [r9], r1
  vld1.32     {d1[0]}, [r9], r1
  vld1.32     {d1[1]}, [r9], r1
  vld1.32     {d2[0]}, [r9], r1
  vld1.32     {d2[1]}, [r9], r1
  vld1.32     {d3[0]}, [r9], r1
  vld1.32     {d3[1]}, [r9]

  mov         r9, r2

  adr         r12, vtbl_4x4_transpose
  vld1.8      {q3}, [r12]

  vtbl.8      d4, {d0, d1}, d6
  vtbl.8      d5, {d0, d1}, d7
  vtbl.8      d0, {d2, d3}, d6
  vtbl.8      d1, {d2, d3}, d7
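
  @ vtbl.8 treats {d0,d1} as a 16 byte lookup table indexed by the
  @ bytes of d6 and d7.  with the indices from vtbl_4x4_transpose
  @ (0,4,8,12, 1,5,9,13, ...) each output byte gathers one column
  @ of the row-major 4x4 blocks loaded above, so each pair of vtbl
  @ instructions transposes a 4x4 block.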

  @ TODO: rework shuffle above to write
  @       out with 4 instead of 8 writes
  vst1.32     {d4[0]}, [r9], r3
  vst1.32     {d4[1]}, [r9], r3
  vst1.32     {d5[0]}, [r9], r3
  vst1.32     {d5[1]}, [r9]

  add         r9, r2, #4
  vst1.32     {d0[0]}, [r9], r3
  vst1.32     {d0[1]}, [r9], r3
  vst1.32     {d1[0]}, [r9], r3
  vst1.32     {d1[1]}, [r9]

  add         r0, #4            @ src += 4
  add         r2, r3, lsl #2    @ dst += 4 * dst_stride
  subs        r8,  #4           @ w   -= 4
  beq         Ldone

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
  blt         Lblock_1x8

Lblock_2x8:
  mov         r9, r0
  vld1.16     {d0[0]}, [r9], r1
  vld1.16     {d1[0]}, [r9], r1
  vld1.16     {d0[1]}, [r9], r1
  vld1.16     {d1[1]}, [r9], r1
  vld1.16     {d0[2]}, [r9], r1
  vld1.16     {d1[2]}, [r9], r1
  vld1.16     {d0[3]}, [r9], r1
  vld1.16     {d1[3]}, [r9]

  vtrn.8      d0, d1

  mov         r9, r2

  vst1.64     {d0}, [r9], r3
  vst1.64     {d1}, [r9]

  add         r0, #2            @ src += 2
  add         r2, r3, lsl #1    @ dst += 2 * dst_stride
  subs        r8,  #2           @ w   -= 2
  beq         Ldone

Lblock_1x8:
  vld1.8      {d0[0]}, [r0], r1
  vld1.8      {d0[1]}, [r0], r1
  vld1.8      {d0[2]}, [r0], r1
  vld1.8      {d0[3]}, [r0], r1
  vld1.8      {d0[4]}, [r0], r1
  vld1.8      {d0[5]}, [r0], r1
  vld1.8      {d0[6]}, [r0], r1
  vld1.8      {d0[7]}, [r0]

  vst1.64     {d0}, [r2]

Ldone:

  pop         {r4,r8,r9,pc}

vtbl_4x4_transpose:
  .byte  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
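
@ reading the table as four groups of four: indices 0,4,8,12 pick
@ column 0 of a row-major 4x4 byte block, 1,5,9,13 pick column 1,
@ and so on, which is exactly the transposed element order.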

@ void SaveRegisters_NEON (unsigned long long* store)
@ r0 unsigned long long* store
SaveRegisters_NEON:
  vst1.i64    {d8, d9, d10, d11}, [r0]!
  vst1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr

@ void RestoreRegisters_NEON (unsigned long long* store)
@ r0 unsigned long long* store
RestoreRegisters_NEON:
  vld1.i64    {d8, d9, d10, d11}, [r0]!
  vld1.i64    {d12, d13, d14, d15}, [r0]!
  bx          lr
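
@ note: d8-d15 are callee saved under the ARM EABI (AAPCS), and
@ TransposeUVWx8_NEON below clobbers them without saving them
@ itself.  a C caller is therefore expected to bracket it with
@ these helpers, along the lines of (hypothetical caller, for
@ illustration only):
@   unsigned long long store[8];
@   SaveRegisters_NEON(store);
@   TransposeUVWx8_NEON(src, src_stride, dst_a, dst_stride_a,
@                       dst_b, dst_stride_b, width);
@   RestoreRegisters_NEON(store);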

@ void ReverseLineUV_NEON (const uint8* src,
@                          uint8* dst_a,
@                          uint8* dst_b,
@                          int width)
@ r0 const uint8* src
@ r1 uint8* dst_a
@ r2 uint8* dst_b
@ r3 width
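@
@ roughly what this routine computes, as a scalar C sketch
@ (illustrative only; src holds interleaved UV pairs and width
@ counts output elements per plane):
@   void ReverseLineUV_C(const uint8* src, uint8* dst_a,
@                        uint8* dst_b, int width) {
@     for (int i = 0; i < width; ++i) {
@       dst_a[width - 1 - i] = src[2 * i + 0];
@       dst_b[width - 1 - i] = src[2 * i + 1];
@     }
@   }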
ReverseLineUV_NEON:

  @ compute where to start writing destination
  add         r1, r1, r3      @ dst_a + width
  add         r2, r2, r3      @ dst_b + width

  @ work on input segments that are multiples of 16 bytes.  the
  @ width that was passed in counts output elements, which are
  @ half the size of the input pairs, so test it against blocks
  @ of 8.
  lsrs        r12, r3, #3

  beq         Lline_residuals_di

  @ the output is written out in two blocks, one per destination.
  mov         r12, #-8

  @ back the destinations off by the size of the register that
  @ is going to be reversed
  sub         r1, r1, #8
  sub         r2, r2, #8

  @ the loop needs to run on blocks of 8.  what will be left
  @ over is either a negative number, the residuals that need
  @ to be done, or 0.  if this isn't subtracted off here the
  @ loop will run one extra time.
  sub         r3, r3, #8

Lsegments_of_8_di:
    vld2.8      {d0, d1}, [r0]!         @ src += 16

    @ reverse the bytes in the 64 bit segments
    vrev64.8    q0, q0

    vst1.8      {d0}, [r1], r12         @ dst_a -= 8
    vst1.8      {d1}, [r2], r12         @ dst_b -= 8

    subs        r3, r3, #8
    bge         Lsegments_of_8_di

  @ add 8 back to the counter.  if the result is 0 there are no
  @ residuals, so return
  adds        r3, r3, #8
  bxeq        lr

  add         r1, r1, #8
  add         r2, r2, #8

Lline_residuals_di:

  mov         r12, #-1

  sub         r1, r1, #1
  sub         r2, r2, #1

@ do this in neon registers as per
@ http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
Lsegments_of_1:
    vld2.8      {d0[0], d1[0]}, [r0]!     @ src += 2

    vst1.8      {d0[0]}, [r1], r12        @ dst_a -= 1
    vst1.8      {d1[0]}, [r2], r12        @ dst_b -= 1

    subs        r3, r3, #1
    bgt         Lsegments_of_1

  bx          lr

@ void TransposeUVWx8_NEON (const uint8* src, int src_stride,
@                           uint8* dst_a, int dst_stride_a,
@                           uint8* dst_b, int dst_stride_b,
@                           int width)
@ r0 const uint8* src
@ r1 int src_stride
@ r2 uint8* dst_a
@ r3 int dst_stride_a
@ stack uint8* dst_b
@ stack int dst_stride_b
@ stack int width
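@
@ roughly what this routine computes, as a scalar C sketch
@ (illustrative only; src holds interleaved UV pairs):
@   void TransposeUVWx8_C(const uint8* src, int src_stride,
@                         uint8* dst_a, int dst_stride_a,
@                         uint8* dst_b, int dst_stride_b, int w) {
@     for (int i = 0; i < w; ++i)
@       for (int j = 0; j < 8; ++j) {
@         dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];
@         dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];
@       }
@   }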
TransposeUVWx8_NEON:
  push        {r4-r9,lr}

  ldr         r4, [sp, #28]         @ dst_b
  ldr         r5, [sp, #32]         @ dst_stride_b
  ldr         r8, [sp, #36]         @ width
  @ loops are on blocks of 8.  the loop will stop when the
  @ counter gets to or below 0.  starting the counter
  @ at w-8 allows for this
  sub         r8, #8

@ handle 8x8 blocks.  this should be the majority of the plane
Lloop_8x8_di:
    mov         r9, r0

    vld2.8      {d0,  d1},  [r9], r1
    vld2.8      {d2,  d3},  [r9], r1
    vld2.8      {d4,  d5},  [r9], r1
    vld2.8      {d6,  d7},  [r9], r1
    vld2.8      {d8,  d9},  [r9], r1
    vld2.8      {d10, d11}, [r9], r1
    vld2.8      {d12, d13}, [r9], r1
    vld2.8      {d14, d15}, [r9]
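
    @ vld2.8 deinterleaves as it loads: the U bytes of each row land
    @ in the even numbered d registers and the V bytes in the odd
    @ ones, so the two planes can be transposed side by side below.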

    vtrn.8      q1, q0
    vtrn.8      q3, q2
    vtrn.8      q5, q4
    vtrn.8      q7, q6

    vtrn.16     q1, q3
    vtrn.16     q0, q2
    vtrn.16     q5, q7
    vtrn.16     q4, q6

    vtrn.32     q1, q5
    vtrn.32     q0, q4
    vtrn.32     q3, q7
    vtrn.32     q2, q6

    vrev16.8    q0, q0
    vrev16.8    q1, q1
    vrev16.8    q2, q2
    vrev16.8    q3, q3
    vrev16.8    q4, q4
    vrev16.8    q5, q5
    vrev16.8    q6, q6
    vrev16.8    q7, q7

    mov         r9, r2

    vst1.8      {d2},  [r9], r3
    vst1.8      {d0},  [r9], r3
    vst1.8      {d6},  [r9], r3
    vst1.8      {d4},  [r9], r3
    vst1.8      {d10}, [r9], r3
    vst1.8      {d8},  [r9], r3
    vst1.8      {d14}, [r9], r3
    vst1.8      {d12}, [r9]

    mov         r9, r4

    vst1.8      {d3},  [r9], r5
    vst1.8      {d1},  [r9], r5
    vst1.8      {d7},  [r9], r5
    vst1.8      {d5},  [r9], r5
    vst1.8      {d11}, [r9], r5
    vst1.8      {d9},  [r9], r5
    vst1.8      {d15}, [r9], r5
    vst1.8      {d13}, [r9]

    add         r0, #8*2          @ src   += 8*2
    add         r2, r3, lsl #3    @ dst_a += 8 * dst_stride_a
    add         r4, r5, lsl #3    @ dst_b += 8 * dst_stride_b
    subs        r8,  #8           @ w     -= 8
    bge         Lloop_8x8_di

  @ add 8 back to the counter.  if the result is 0 there are
  @ no residuals.
  adds        r8, #8
  beq         Ldone_di

  @ some residual, so between 1 and 7 columns left to transpose
  cmp         r8, #2
  blt         Lblock_1x8_di

  cmp         r8, #4
  blt         Lblock_2x8_di

@ TODO(frkoenig) : clean this up
Lblock_4x8_di:
  mov         r9, r0
  vld1.64     {d0}, [r9], r1
  vld1.64     {d1}, [r9], r1
  vld1.64     {d2}, [r9], r1
  vld1.64     {d3}, [r9], r1
  vld1.64     {d4}, [r9], r1
  vld1.64     {d5}, [r9], r1
  vld1.64     {d6}, [r9], r1
  vld1.64     {d7}, [r9]

  adr         r12, vtbl_4x4_transpose_di
  vld1.8      {q7}, [r12]

  vtrn.8      q0, q1
  vtrn.8      q2, q3

  vtbl.8      d8,  {d0, d1}, d14
  vtbl.8      d9,  {d0, d1}, d15
  vtbl.8      d10, {d2, d3}, d14
  vtbl.8      d11, {d2, d3}, d15
  vtbl.8      d12, {d4, d5}, d14
  vtbl.8      d13, {d4, d5}, d15
  vtbl.8      d0,  {d6, d7}, d14
  vtbl.8      d1,  {d6, d7}, d15

  mov         r9, r2

  vst1.32     {d8[0]},  [r9], r3
  vst1.32     {d8[1]},  [r9], r3
  vst1.32     {d9[0]},  [r9], r3
  vst1.32     {d9[1]},  [r9], r3

  add         r9, r2, #4
  vst1.32     {d12[0]}, [r9], r3
  vst1.32     {d12[1]}, [r9], r3
  vst1.32     {d13[0]}, [r9], r3
  vst1.32     {d13[1]}, [r9]

  mov         r9, r4

  vst1.32     {d10[0]}, [r9], r5
  vst1.32     {d10[1]}, [r9], r5
  vst1.32     {d11[0]}, [r9], r5
  vst1.32     {d11[1]}, [r9], r5

  add         r9, r4, #4
  vst1.32     {d0[0]},  [r9], r5
  vst1.32     {d0[1]},  [r9], r5
  vst1.32     {d1[0]},  [r9], r5
  vst1.32     {d1[1]},  [r9]

  add         r0, #4*2          @ src   += 4 * 2
  add         r2, r3, lsl #2    @ dst_a += 4 * dst_stride_a
  add         r4, r5, lsl #2    @ dst_b += 4 * dst_stride_b
  subs        r8,  #4           @ w     -= 4
  beq         Ldone_di

  @ some residual, check to see if it includes a 2x8 block,
  @ or less
  cmp         r8, #2
  blt         Lblock_1x8_di

Lblock_2x8_di:
  mov         r9, r0
  vld2.16     {d0[0], d2[0]}, [r9], r1
  vld2.16     {d1[0], d3[0]}, [r9], r1
  vld2.16     {d0[1], d2[1]}, [r9], r1
  vld2.16     {d1[1], d3[1]}, [r9], r1
  vld2.16     {d0[2], d2[2]}, [r9], r1
  vld2.16     {d1[2], d3[2]}, [r9], r1
  vld2.16     {d0[3], d2[3]}, [r9], r1
  vld2.16     {d1[3], d3[3]}, [r9]

  vtrn.8      d0, d1
  vtrn.8      d2, d3

  mov         r9, r2

  vst1.64     {d0}, [r9], r3
  vst1.64     {d2}, [r9]

  mov         r9, r4

  vst1.64     {d1}, [r9], r5
  vst1.64     {d3}, [r9]

  add         r0, #2*2          @ src   += 2 * 2
  add         r2, r3, lsl #1    @ dst_a += 2 * dst_stride_a
  add         r4, r5, lsl #1    @ dst_b += 2 * dst_stride_b
  subs        r8,  #2           @ w     -= 2
  beq         Ldone_di

Lblock_1x8_di:
  vld2.8      {d0[0], d1[0]}, [r0], r1
  vld2.8      {d0[1], d1[1]}, [r0], r1
  vld2.8      {d0[2], d1[2]}, [r0], r1
  vld2.8      {d0[3], d1[3]}, [r0], r1
  vld2.8      {d0[4], d1[4]}, [r0], r1
  vld2.8      {d0[5], d1[5]}, [r0], r1
  vld2.8      {d0[6], d1[6]}, [r0], r1
  vld2.8      {d0[7], d1[7]}, [r0]

  vst1.64     {d0}, [r2]
  vst1.64     {d1}, [r4]

Ldone_di:
  pop         {r4-r9, pc}

vtbl_4x4_transpose_di:
  .byte  0,  8,  1,  9,  2, 10,  3, 11,  4, 12,  5, 13,  6, 14,  7, 15