rotate.cc revision 351958316957352685f15674ee5bd71726dc993e
/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "rotate_priv.h"

#include "libyuv/cpu_id.h"

namespace libyuv {

#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
    && !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#if defined(_MSC_VER)
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif
// Shuffle table for reversing the bytes.
extern "C" TALIGN16(const uint8, kShuffleReverse[16]) =
  { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u };
// Shuffle table for reversing the bytes of UV channels.
extern "C" TALIGN16(const uint8, kShuffleReverseUV[16]) =
  { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u };
#endif
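// Illustrative note (comment added for clarity, not in the original source):
// pshufb uses each table byte as an index into the source register, so
// kShuffleReverse maps input bytes {0..15} to {15..0}, and kShuffleReverseUV
// additionally deinterleaves the reversed row into 8 U bytes followed by
// 8 V bytes.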

typedef void (*reverse_uv_func)(const uint8*, uint8*, uint8*, int);
typedef void (*reverse_func)(const uint8*, uint8*, int);
typedef void (*rotate_uv_wx8_func)(const uint8*, int,
                                   uint8*, int,
                                   uint8*, int, int);
typedef void (*rotate_uv_wxh_func)(const uint8*, int,
                                   uint8*, int,
                                   uint8*, int, int, int);
typedef void (*rotate_wx8_func)(const uint8*, int, uint8*, int, int);
typedef void (*rotate_wxh_func)(const uint8*, int, uint8*, int, int, int);
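// Note (added comment): these function pointer types let TransposePlane,
// TransposeUV and the 180-degree rotations below pick an SSE2/SSSE3/NEON
// implementation at runtime via TestCpuFlag, falling back to the portable
// _C versions otherwise.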

#if 0 // Need to add rotate_neon.s to the build to enable this
#ifdef __ARM_NEON__
extern "C" {
void RestoreRegisters_NEON(unsigned long long *restore);
void SaveRegisters_NEON(unsigned long long *store);
#define HAS_REVERSE_LINE_NEON
void ReverseLine_NEON(const uint8* src, uint8* dst, int width);
#define HAS_REVERSE_LINE_UV_NEON
void ReverseLineUV_NEON(const uint8* src,
                        uint8* dst_a, uint8* dst_b,
                        int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
}  // extern "C"
#endif
#endif

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_TRANSPOSE_WX8_SSSE3
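// In outline (comment added for clarity, not in the original source): the
// SIMD transposes below interleave bytes from adjacent rows
// (punpcklbw/punpckhbw), then 16-bit pairs (punpcklwd/punpckhwd), then 32-bit
// pairs (punpckldq/punpckhdq); after the three rounds the eight source rows
// have been rearranged into transposed 8-byte output rows.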
__declspec(naked)
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
__asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width
 convertloop :
    // Read in the data from the source pointer.
    // First round of bit swap.
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    sub       ecx, 8
    ja        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked)
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
__asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w
 convertloop :
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqa    xmm2, [eax]
    movdqa    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqa    xmm4, [eax]
    movdqa    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqa    xmm6, [eax]
    movdqa    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqa    xmm5, [esp]  // restore xmm5
    movdqa    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5    // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqa    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    sub       ecx, 8
    ja        convertloop

    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile(
"1:"
  // Read in the data from the source pointer.
  // First round of bit swap.
  "movq       (%0),%%xmm0\n"
  "movq       (%0,%3),%%xmm1\n"
  "lea        (%0,%3,2),%0\n"
  "punpcklbw  %%xmm1,%%xmm0\n"
  "movq       (%0),%%xmm2\n"
  "movdqa     %%xmm0,%%xmm1\n"
  "palignr    $0x8,%%xmm1,%%xmm1\n"
  "movq       (%0,%3),%%xmm3\n"
  "lea        (%0,%3,2),%0\n"
  "punpcklbw  %%xmm3,%%xmm2\n"
  "movdqa     %%xmm2,%%xmm3\n"
  "movq       (%0),%%xmm4\n"
  "palignr    $0x8,%%xmm3,%%xmm3\n"
  "movq       (%0,%3),%%xmm5\n"
  "lea        (%0,%3,2),%0\n"
  "punpcklbw  %%xmm5,%%xmm4\n"
  "movdqa     %%xmm4,%%xmm5\n"
  "movq       (%0),%%xmm6\n"
  "palignr    $0x8,%%xmm5,%%xmm5\n"
  "movq       (%0,%3),%%xmm7\n"
  "lea        (%0,%3,2),%0\n"
  "punpcklbw  %%xmm7,%%xmm6\n"
  "neg        %3\n"
  "movdqa     %%xmm6,%%xmm7\n"
  "lea        0x8(%0,%3,8),%0\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "neg        %3\n"
   // Second round of bit swap.
  "punpcklwd  %%xmm2,%%xmm0\n"
  "punpcklwd  %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "palignr    $0x8,%%xmm2,%%xmm2\n"
  "palignr    $0x8,%%xmm3,%%xmm3\n"
  "punpcklwd  %%xmm6,%%xmm4\n"
  "punpcklwd  %%xmm7,%%xmm5\n"
  "movdqa     %%xmm4,%%xmm6\n"
  "movdqa     %%xmm5,%%xmm7\n"
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "punpckldq  %%xmm4,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "movdqa     %%xmm0,%%xmm4\n"
  "palignr    $0x8,%%xmm4,%%xmm4\n"
  "movq       %%xmm4,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm6,%%xmm2\n"
  "movdqa     %%xmm2,%%xmm6\n"
  "movq       %%xmm2,(%1)\n"
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "punpckldq  %%xmm5,%%xmm1\n"
  "movq       %%xmm6,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "movdqa     %%xmm1,%%xmm5\n"
  "movq       %%xmm1,(%1)\n"
  "palignr    $0x8,%%xmm5,%%xmm5\n"
  "movq       %%xmm5,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm7,%%xmm3\n"
  "movq       %%xmm3,(%1)\n"
  "movdqa     %%xmm3,%%xmm7\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "movq       %%xmm7,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "sub        $0x8,%2\n"
  "ja         1b\n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"(static_cast<intptr_t>(src_stride)),  // %3
    "r"(static_cast<intptr_t>(dst_stride))   // %4
  : "memory"
);
}

#if defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                    uint8* dst_a, int dst_stride_a,
                                    uint8* dst_b, int dst_stride_b,
                                    int w);
  asm(
    ".text\n"
#if defined(OSX)
    ".globl _TransposeUVWx8_SSE2\n"
"_TransposeUVWx8_SSE2:\n"
#else
    ".global TransposeUVWx8_SSE2\n"
"TransposeUVWx8_SSE2:\n"
#endif
    "push   %ebx\n"
    "push   %esi\n"
    "push   %edi\n"
    "push   %ebp\n"
    "mov    0x14(%esp),%eax\n"
    "mov    0x18(%esp),%edi\n"
    "mov    0x1c(%esp),%edx\n"
    "mov    0x20(%esp),%esi\n"
    "mov    0x24(%esp),%ebx\n"
    "mov    0x28(%esp),%ebp\n"
    "mov    %esp,%ecx\n"
    "sub    $0x14,%esp\n"
    "and    $0xfffffff0,%esp\n"
    "mov    %ecx,0x10(%esp)\n"
    "mov    0x2c(%ecx),%ecx\n"

"1:"
    "movdqa (%eax),%xmm0\n"
    "movdqa (%eax,%edi,1),%xmm1\n"
    "lea    (%eax,%edi,2),%eax\n"
    "movdqa %xmm0,%xmm7\n"
    "punpcklbw %xmm1,%xmm0\n"
    "punpckhbw %xmm1,%xmm7\n"
    "movdqa %xmm7,%xmm1\n"
    "movdqa (%eax),%xmm2\n"
    "movdqa (%eax,%edi,1),%xmm3\n"
    "lea    (%eax,%edi,2),%eax\n"
    "movdqa %xmm2,%xmm7\n"
    "punpcklbw %xmm3,%xmm2\n"
    "punpckhbw %xmm3,%xmm7\n"
    "movdqa %xmm7,%xmm3\n"
    "movdqa (%eax),%xmm4\n"
    "movdqa (%eax,%edi,1),%xmm5\n"
    "lea    (%eax,%edi,2),%eax\n"
    "movdqa %xmm4,%xmm7\n"
    "punpcklbw %xmm5,%xmm4\n"
    "punpckhbw %xmm5,%xmm7\n"
    "movdqa %xmm7,%xmm5\n"
    "movdqa (%eax),%xmm6\n"
    "movdqa (%eax,%edi,1),%xmm7\n"
    "lea    (%eax,%edi,2),%eax\n"
    "movdqa %xmm5,(%esp)\n"
    "neg    %edi\n"
    "movdqa %xmm6,%xmm5\n"
    "punpcklbw %xmm7,%xmm6\n"
    "punpckhbw %xmm7,%xmm5\n"
    "movdqa %xmm5,%xmm7\n"
    "lea    0x10(%eax,%edi,8),%eax\n"
    "neg    %edi\n"
    "movdqa %xmm0,%xmm5\n"
    "punpcklwd %xmm2,%xmm0\n"
    "punpckhwd %xmm2,%xmm5\n"
    "movdqa %xmm5,%xmm2\n"
    "movdqa %xmm1,%xmm5\n"
    "punpcklwd %xmm3,%xmm1\n"
    "punpckhwd %xmm3,%xmm5\n"
    "movdqa %xmm5,%xmm3\n"
    "movdqa %xmm4,%xmm5\n"
    "punpcklwd %xmm6,%xmm4\n"
    "punpckhwd %xmm6,%xmm5\n"
    "movdqa %xmm5,%xmm6\n"
    "movdqa (%esp),%xmm5\n"
    "movdqa %xmm6,(%esp)\n"
    "movdqa %xmm5,%xmm6\n"
    "punpcklwd %xmm7,%xmm5\n"
    "punpckhwd %xmm7,%xmm6\n"
    "movdqa %xmm6,%xmm7\n"
    "movdqa %xmm0,%xmm6\n"
    "punpckldq %xmm4,%xmm0\n"
    "punpckhdq %xmm4,%xmm6\n"
    "movdqa %xmm6,%xmm4\n"
    "movdqa (%esp),%xmm6\n"
    "movlpd %xmm0,(%edx)\n"
    "movhpd %xmm0,(%ebx)\n"
    "movlpd %xmm4,(%edx,%esi,1)\n"
    "lea    (%edx,%esi,2),%edx\n"
    "movhpd %xmm4,(%ebx,%ebp,1)\n"
    "lea    (%ebx,%ebp,2),%ebx\n"
    "movdqa %xmm2,%xmm0\n"
    "punpckldq %xmm6,%xmm2\n"
    "movlpd %xmm2,(%edx)\n"
    "movhpd %xmm2,(%ebx)\n"
    "punpckhdq %xmm6,%xmm0\n"
    "movlpd %xmm0,(%edx,%esi,1)\n"
    "lea    (%edx,%esi,2),%edx\n"
    "movhpd %xmm0,(%ebx,%ebp,1)\n"
    "lea    (%ebx,%ebp,2),%ebx\n"
    "movdqa %xmm1,%xmm0\n"
    "punpckldq %xmm5,%xmm1\n"
    "movlpd %xmm1,(%edx)\n"
    "movhpd %xmm1,(%ebx)\n"
    "punpckhdq %xmm5,%xmm0\n"
    "movlpd %xmm0,(%edx,%esi,1)\n"
    "lea    (%edx,%esi,2),%edx\n"
    "movhpd %xmm0,(%ebx,%ebp,1)\n"
    "lea    (%ebx,%ebp,2),%ebx\n"
    "movdqa %xmm3,%xmm0\n"
    "punpckldq %xmm7,%xmm3\n"
    "movlpd %xmm3,(%edx)\n"
    "movhpd %xmm3,(%ebx)\n"
    "punpckhdq %xmm7,%xmm0\n"
    "movlpd %xmm0,(%edx,%esi,1)\n"
    "lea    (%edx,%esi,2),%edx\n"
    "movhpd %xmm0,(%ebx,%ebp,1)\n"
    "lea    (%ebx,%ebp,2),%ebx\n"
    "sub    $0x8,%ecx\n"
    "ja     1b\n"
    "mov    0x10(%esp),%esp\n"
    "pop    %ebp\n"
    "pop    %edi\n"
    "pop    %esi\n"
    "pop    %ebx\n"
    "ret\n"
);
#elif defined (__x86_64__)
// The 64-bit version has enough registers to transpose 16x8 blocks to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  asm volatile(
"1:"
  // Read in the data from the source pointer.
  // First round of bit swap.
  "movdqa     (%0),%%xmm0\n"
  "movdqa     (%0,%3),%%xmm1\n"
  "lea        (%0,%3,2),%0\n"
  "movdqa     %%xmm0,%%xmm8\n"
  "punpcklbw  %%xmm1,%%xmm0\n"
  "punpckhbw  %%xmm1,%%xmm8\n"
  "movdqa     (%0),%%xmm2\n"
  "movdqa     %%xmm0,%%xmm1\n"
  "movdqa     %%xmm8,%%xmm9\n"
  "palignr    $0x8,%%xmm1,%%xmm1\n"
  "palignr    $0x8,%%xmm9,%%xmm9\n"
  "movdqa     (%0,%3),%%xmm3\n"
  "lea        (%0,%3,2),%0\n"
  "movdqa     %%xmm2,%%xmm10\n"
  "punpcklbw  %%xmm3,%%xmm2\n"
  "punpckhbw  %%xmm3,%%xmm10\n"
  "movdqa     %%xmm2,%%xmm3\n"
  "movdqa     %%xmm10,%%xmm11\n"
  "movdqa     (%0),%%xmm4\n"
  "palignr    $0x8,%%xmm3,%%xmm3\n"
  "palignr    $0x8,%%xmm11,%%xmm11\n"
  "movdqa     (%0,%3),%%xmm5\n"
  "lea        (%0,%3,2),%0\n"
  "movdqa     %%xmm4,%%xmm12\n"
  "punpcklbw  %%xmm5,%%xmm4\n"
  "punpckhbw  %%xmm5,%%xmm12\n"
  "movdqa     %%xmm4,%%xmm5\n"
  "movdqa     %%xmm12,%%xmm13\n"
  "movdqa     (%0),%%xmm6\n"
  "palignr    $0x8,%%xmm5,%%xmm5\n"
  "palignr    $0x8,%%xmm13,%%xmm13\n"
  "movdqa     (%0,%3),%%xmm7\n"
  "lea        (%0,%3,2),%0\n"
  "movdqa     %%xmm6,%%xmm14\n"
  "punpcklbw  %%xmm7,%%xmm6\n"
  "punpckhbw  %%xmm7,%%xmm14\n"
  "neg        %3\n"
  "movdqa     %%xmm6,%%xmm7\n"
  "movdqa     %%xmm14,%%xmm15\n"
  "lea        0x10(%0,%3,8),%0\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "palignr    $0x8,%%xmm15,%%xmm15\n"
  "neg        %3\n"
   // Second round of bit swap.
  "punpcklwd  %%xmm2,%%xmm0\n"
  "punpcklwd  %%xmm3,%%xmm1\n"
  "movdqa     %%xmm0,%%xmm2\n"
  "movdqa     %%xmm1,%%xmm3\n"
  "palignr    $0x8,%%xmm2,%%xmm2\n"
  "palignr    $0x8,%%xmm3,%%xmm3\n"
  "punpcklwd  %%xmm6,%%xmm4\n"
  "punpcklwd  %%xmm7,%%xmm5\n"
  "movdqa     %%xmm4,%%xmm6\n"
  "movdqa     %%xmm5,%%xmm7\n"
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "punpcklwd  %%xmm10,%%xmm8\n"
  "punpcklwd  %%xmm11,%%xmm9\n"
  "movdqa     %%xmm8,%%xmm10\n"
  "movdqa     %%xmm9,%%xmm11\n"
  "palignr    $0x8,%%xmm10,%%xmm10\n"
  "palignr    $0x8,%%xmm11,%%xmm11\n"
  "punpcklwd  %%xmm14,%%xmm12\n"
  "punpcklwd  %%xmm15,%%xmm13\n"
  "movdqa     %%xmm12,%%xmm14\n"
  "movdqa     %%xmm13,%%xmm15\n"
  "palignr    $0x8,%%xmm14,%%xmm14\n"
  "palignr    $0x8,%%xmm15,%%xmm15\n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "punpckldq  %%xmm4,%%xmm0\n"
  "movq       %%xmm0,(%1)\n"
  "movdqa     %%xmm0,%%xmm4\n"
  "palignr    $0x8,%%xmm4,%%xmm4\n"
  "movq       %%xmm4,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm6,%%xmm2\n"
  "movdqa     %%xmm2,%%xmm6\n"
  "movq       %%xmm2,(%1)\n"
  "palignr    $0x8,%%xmm6,%%xmm6\n"
  "punpckldq  %%xmm5,%%xmm1\n"
  "movq       %%xmm6,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "movdqa     %%xmm1,%%xmm5\n"
  "movq       %%xmm1,(%1)\n"
  "palignr    $0x8,%%xmm5,%%xmm5\n"
  "movq       %%xmm5,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm7,%%xmm3\n"
  "movq       %%xmm3,(%1)\n"
  "movdqa     %%xmm3,%%xmm7\n"
  "palignr    $0x8,%%xmm7,%%xmm7\n"
  "movq       %%xmm7,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm12,%%xmm8\n"
  "movq       %%xmm8,(%1)\n"
  "movdqa     %%xmm8,%%xmm12\n"
  "palignr    $0x8,%%xmm12,%%xmm12\n"
  "movq       %%xmm12,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm14,%%xmm10\n"
  "movdqa     %%xmm10,%%xmm14\n"
  "movq       %%xmm10,(%1)\n"
  "palignr    $0x8,%%xmm14,%%xmm14\n"
  "punpckldq  %%xmm13,%%xmm9\n"
  "movq       %%xmm14,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "movdqa     %%xmm9,%%xmm13\n"
  "movq       %%xmm9,(%1)\n"
  "palignr    $0x8,%%xmm13,%%xmm13\n"
  "movq       %%xmm13,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "punpckldq  %%xmm15,%%xmm11\n"
  "movq       %%xmm11,(%1)\n"
  "movdqa     %%xmm11,%%xmm15\n"
  "palignr    $0x8,%%xmm15,%%xmm15\n"
  "movq       %%xmm15,(%1,%4)\n"
  "lea        (%1,%4,2),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"(static_cast<intptr_t>(src_stride)),  // %3
    "r"(static_cast<intptr_t>(dst_stride))   // %4
  : "memory"
);
}

#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile(
"1:"
  // Read in the data from the source pointer.
  // First round of bit swap.
  "movdqa     (%0),%%xmm0\n"
  "movdqa     (%0,%4),%%xmm1\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm0,%%xmm8\n"
  "punpcklbw  %%xmm1,%%xmm0\n"
  "punpckhbw  %%xmm1,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm1\n"
  "movdqa     (%0),%%xmm2\n"
  "movdqa     (%0,%4),%%xmm3\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm2,%%xmm8\n"
  "punpcklbw  %%xmm3,%%xmm2\n"
  "punpckhbw  %%xmm3,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm3\n"
  "movdqa     (%0),%%xmm4\n"
  "movdqa     (%0,%4),%%xmm5\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm4,%%xmm8\n"
  "punpcklbw  %%xmm5,%%xmm4\n"
  "punpckhbw  %%xmm5,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm5\n"
  "movdqa     (%0),%%xmm6\n"
  "movdqa     (%0,%4),%%xmm7\n"
  "lea        (%0,%4,2),%0\n"
  "movdqa     %%xmm6,%%xmm8\n"
  "punpcklbw  %%xmm7,%%xmm6\n"
  "neg        %4\n"
  "lea        0x10(%0,%4,8),%0\n"
  "punpckhbw  %%xmm7,%%xmm8\n"
  "movdqa     %%xmm8,%%xmm7\n"
  "neg        %4\n"
   // Second round of bit swap.
  "movdqa     %%xmm0,%%xmm8\n"
  "movdqa     %%xmm1,%%xmm9\n"
  "punpckhwd  %%xmm2,%%xmm8\n"
  "punpckhwd  %%xmm3,%%xmm9\n"
  "punpcklwd  %%xmm2,%%xmm0\n"
  "punpcklwd  %%xmm3,%%xmm1\n"
  "movdqa     %%xmm8,%%xmm2\n"
  "movdqa     %%xmm9,%%xmm3\n"
  "movdqa     %%xmm4,%%xmm8\n"
  "movdqa     %%xmm5,%%xmm9\n"
  "punpckhwd  %%xmm6,%%xmm8\n"
  "punpckhwd  %%xmm7,%%xmm9\n"
  "punpcklwd  %%xmm6,%%xmm4\n"
  "punpcklwd  %%xmm7,%%xmm5\n"
  "movdqa     %%xmm8,%%xmm6\n"
  "movdqa     %%xmm9,%%xmm7\n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "movdqa     %%xmm0,%%xmm8\n"
  "punpckldq  %%xmm4,%%xmm0\n"
  "movlpd     %%xmm0,(%1)\n"  // Write back U channel
  "movhpd     %%xmm0,(%2)\n"  // Write back V channel
  "punpckhdq  %%xmm4,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm2,%%xmm8\n"
  "punpckldq  %%xmm6,%%xmm2\n"
  "movlpd     %%xmm2,(%1)\n"
  "movhpd     %%xmm2,(%2)\n"
  "punpckhdq  %%xmm6,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm1,%%xmm8\n"
  "punpckldq  %%xmm5,%%xmm1\n"
  "movlpd     %%xmm1,(%1)\n"
  "movhpd     %%xmm1,(%2)\n"
  "punpckhdq  %%xmm5,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "movdqa     %%xmm3,%%xmm8\n"
  "punpckldq  %%xmm7,%%xmm3\n"
  "movlpd     %%xmm3,(%1)\n"
  "movhpd     %%xmm3,(%2)\n"
  "punpckhdq  %%xmm7,%%xmm8\n"
  "movlpd     %%xmm8,(%1,%5)\n"
  "lea        (%1,%5,2),%1\n"
  "movhpd     %%xmm8,(%2,%6)\n"
  "lea        (%2,%6,2),%2\n"
  "sub        $0x8,%3\n"
  "ja         1b\n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(w)       // %3
  : "r"(static_cast<intptr_t>(src_stride)),    // %4
    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
  : "memory"
);
}
#endif
#endif

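// Portable reference version (comment added for clarity): transposes an
// 8-row by w-column tile, turning each source column into one destination row.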
static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int w) {
  int i;
  for (i = 0; i < w; ++i) {
    dst[0] = src[0 * src_stride];
    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
    dst[3] = src[3 * src_stride];
    dst[4] = src[4 * src_stride];
    dst[5] = src[5 * src_stride];
    dst[6] = src[6 * src_stride];
    dst[7] = src[7 * src_stride];
    ++src;
    dst += dst_stride;
  }
}

static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  int i, j;
  for (i = 0; i < width; ++i)
    for (j = 0; j < height; ++j)
      dst[i * dst_stride + j] = src[j * src_stride + i];
}

void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i = height;
  rotate_wx8_func TransposeWx8;
  rotate_wxh_func TransposeWxH;

#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_NEON;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_FAST_SSSE3;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
      IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
    TransposeWx8 = TransposeWx8_SSSE3;
    TransposeWxH = TransposeWxH_C;
  } else
#endif
  {
    TransposeWx8 = TransposeWx8_C;
    TransposeWxH = TransposeWxH_C;
  }

  // work across the source in 8x8 tiles
  while (i >= 8) {
    TransposeWx8(src, src_stride, dst, dst_stride, width);

    src += 8 * src_stride;    // go down 8 rows
    dst += 8;                 // move over 8 columns
    i   -= 8;
  }

  TransposeWxH(src, src_stride, dst, dst_stride, width, i);
}

void RotatePlane90(const uint8* src, int src_stride,
                   uint8* dst, int dst_stride,
                   int width, int height) {
  // Rotate by 90 is a transpose with the source read
  // from bottom to top.  So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

void RotatePlane270(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top.  So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;

  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

static void ReverseLine_C(const uint8* src, uint8* dst, int width) {
  int i;
  src += width - 1;
  for (i = 0; i < width; ++i) {
    dst[i] = src[0];
    --src;
  }
}

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_REVERSE_LINE_SSSE3
__declspec(naked)
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
__asm {
    mov       eax, [esp + 4]   // src
    mov       edx, [esp + 8]   // dst
    mov       ecx, [esp + 12]  // width
    movdqa    xmm7, _kShuffleReverse
    lea       eax, [eax + ecx - 16]
 convertloop :
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm7
    movdqa    [edx], xmm0
    lea       edx, [edx + 16]
    sub       ecx, 16
    ja        convertloop
    ret
  }
}

#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_REVERSE_LINE_SSSE3
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile(
  "movdqa     (%3),%%xmm7\n"
  "lea        -0x10(%0,%2,1),%0\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "lea        -0x10(%0),%0\n"
  "pshufb     %%xmm7,%%xmm0\n"
  "movdqa     %%xmm0,(%1)\n"
  "lea        0x10(%1),%1\n"
  "sub        $0x10,%2\n"
  "ja         1b\n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(temp_width)   // %2
  : "r"(kShuffleReverse)   // %3
  : "memory"
);
}
#endif
906
907void RotatePlane180(const uint8* src, int src_stride,
908                    uint8* dst, int dst_stride,
909                    int width, int height) {
910  int i;
911  reverse_func ReverseLine;
912
913#if defined(HAS_REVERSE_LINE_NEON)
914  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
915      (width % 16 == 0) &&
916      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
917      IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
918    ReverseLine = ReverseLine_NEON;
919  } else
920#endif
921#if defined(HAS_REVERSE_LINE_SSSE3)
922  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
923      (width % 16 == 0) &&
924      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
925      IS_ALIGNED(dst, 16) && (dst_stride % 16 == 0)) {
926    ReverseLine = ReverseLine_SSSE3;
927  } else
928#endif
929  {
930    ReverseLine = ReverseLine_C;
931  }
  // Rotating by 180 mirrors each row and flips the image vertically.
  src += src_stride * (height - 1);

  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst, width);
    src -= src_stride;
    dst += dst_stride;
  }
}

static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w) {
  int i;
  for (i = 0; i < w; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}

static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int w, int h) {
  int i, j;
  for (i = 0; i < w * 2; i += 2)
    for (j = 0; j < h; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }
}

void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i = height;
  rotate_uv_wx8_func TransposeWx8;
  rotate_uv_wxh_func TransposeWxH;

#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  unsigned long long store_reg[8];
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    SaveRegisters_NEON(store_reg);
    TransposeWx8 = TransposeUVWx8_NEON;
    TransposeWxH = TransposeUVWxH_C;
  } else
#endif
#if defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
      (width % 8 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    TransposeWx8 = TransposeUVWx8_SSE2;
    TransposeWxH = TransposeUVWxH_C;
  } else
#endif
  {
    TransposeWx8 = TransposeUVWx8_C;
    TransposeWxH = TransposeUVWxH_C;
  }

  // work through the source in 8x8 tiles
  while (i >= 8) {
    TransposeWx8(src, src_stride,
                 dst_a, dst_stride_a,
                 dst_b, dst_stride_b,
                 width);

    src   += 8 * src_stride;    // go down 8 rows
    dst_a += 8;                 // move over 8 columns
    dst_b += 8;                 // move over 8 columns
    i     -= 8;
  }

  TransposeWxH(src, src_stride,
               dst_a, dst_stride_a,
               dst_b, dst_stride_b,
               width, i);

#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON)) {
    RestoreRegisters_NEON(store_reg);
  }
#endif
}

void RotateUV90(const uint8* src, int src_stride,
                uint8* dst_a, int dst_stride_a,
                uint8* dst_b, int dst_stride_b,
                int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

void RotateUV270(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  dst_a += dst_stride_a * (width - 1);
  dst_b += dst_stride_b * (width - 1);
  dst_stride_a = -dst_stride_a;
  dst_stride_b = -dst_stride_b;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_REVERSE_LINE_UV_SSSE3
__declspec(naked)
void ReverseLineUV_SSSE3(const uint8* src,
                         uint8* dst_a, uint8* dst_b,
                         int width) {
__asm {
    push      edi
    mov       eax, [esp + 4 + 4]   // src
    mov       edx, [esp + 4 + 8]   // dst_a
    mov       edi, [esp + 4 + 12]  // dst_b
    mov       ecx, [esp + 4 + 16]  // width
    movdqa    xmm7, _kShuffleReverseUV
    lea       eax, [eax + ecx * 2 - 16]

 convertloop :
    movdqa    xmm0, [eax]
    lea       eax, [eax - 16]
    pshufb    xmm0, xmm7
    movlpd    qword ptr [edx], xmm0
    lea       edx, [edx + 8]
    movhpd    qword ptr [edi], xmm0
    lea       edi, [edi + 8]
    sub       ecx, 8
    ja        convertloop
    pop       edi
    ret
  }
}

#elif (defined(__i386__) || defined(__x86_64__)) && \
    !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_REVERSE_LINE_UV_SSSE3
void ReverseLineUV_SSSE3(const uint8* src,
                         uint8* dst_a, uint8* dst_b,
                         int width) {
  intptr_t temp_width = static_cast<intptr_t>(width);
  asm volatile(
  "movdqa     (%4),%%xmm7\n"
  "lea        -0x10(%0,%3,2),%0\n"
"1:"
  "movdqa     (%0),%%xmm0\n"
  "lea        -0x10(%0),%0\n"
  "pshufb     %%xmm7,%%xmm0\n"
  "movlpd     %%xmm0,(%1)\n"
  "lea        0x8(%1),%1\n"
  "movhpd     %%xmm0,(%2)\n"
  "lea        0x8(%2),%2\n"
  "sub        $0x8,%3\n"
  "ja         1b\n"
  : "+r"(src),      // %0
    "+r"(dst_a),    // %1
    "+r"(dst_b),    // %2
    "+r"(temp_width)     // %3
  : "r"(kShuffleReverseUV)  // %4
  : "memory"
);
}
#endif

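// Portable reference version (comment added for clarity): walks the
// interleaved UV row backwards two bytes at a time, splitting it into
// reversed planar U and V rows.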
static void ReverseLineUV_C(const uint8* src,
                            uint8* dst_a, uint8* dst_b,
                            int width) {
  int i;
  src += width << 1;
  for (i = 0; i < width; ++i) {
    src -= 2;
    dst_a[i] = src[0];
    dst_b[i] = src[1];
  }
}

void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i;
  reverse_uv_func ReverseLine;

#if defined(HAS_REVERSE_LINE_UV_NEON)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    ReverseLine = ReverseLineUV_NEON;
  } else
#endif
#if defined(HAS_REVERSE_LINE_UV_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
      IS_ALIGNED(dst_a, 8) && (dst_stride_a % 8 == 0) &&
      IS_ALIGNED(dst_b, 8) && (dst_stride_b % 8 == 0)) {
    ReverseLine = ReverseLineUV_SSSE3;
  } else
#endif
  {
    ReverseLine = ReverseLineUV_C;
  }

  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);

  for (i = 0; i < height; ++i) {
    ReverseLine(src, dst_a, dst_b, width);

    src   += src_stride;      // down one line at a time
    dst_a -= dst_stride_a;    // nominally up one line at a time
    dst_b -= dst_stride_b;    // nominally up one line at a time
  }
}

int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}
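// A minimal usage sketch (illustrative comment, not in the original file):
// for kRotate90 and kRotate270 the destination planes take the rotated
// dimensions, so the caller is expected to size dst_y as height x width and
// the chroma planes at half of each. Buffer names below are hypothetical and
// even width/height is assumed.
//
//   // uint8 src_y[w * h], src_u[w/2 * h/2], src_v[w/2 * h/2];  // input
//   // uint8 dst_y[h * w], dst_u[h/2 * w/2], dst_v[h/2 * w/2];  // output
//   I420Rotate(src_y, w, src_u, w / 2, src_v, w / 2,
//              dst_y, h, dst_u, h / 2, dst_v, h / 2,
//              w, h, kRotate90);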

int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_uv, src_stride_y,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

}  // namespace libyuv