/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate.h"

#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

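// DECLARE_FUNCTION emits the assembler boilerplate (section, alignment and
// label) for the file-scope asm routine below. On Apple and MinGW/Cygwin
// i386 targets, C symbols carry a leading underscore, so those variants
// prefix the label with "_"; other targets use the plain name.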
#if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".private_extern _" #name "                \n"                             \
    ".align 4,0x90                             \n"                             \
"_" #name ":                                   \n"
#elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".align 4,0x90                             \n"                             \
"_" #name ":                                   \n"
#else
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".align 4,0x90                             \n"                             \
#name ":                                       \n"
#endif
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_MIRRORROW_UV_NEON
void MirrorRowUV_NEON(const uint8* src,
                      uint8* dst_a, uint8* dst_b,
                      int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
#endif  // defined(__ARM_NEON__)

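// The SSE2/SSSE3 transpose kernels below all follow the same pattern: load
// 8 rows of 8 (or 16) bytes, then interleave in three rounds - bytes
// (punpcklbw), words (punpcklwd) and dwords (punpckldq) - so that each
// 8-byte store writes one column of the original tile as a row of the
// destination.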
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      16
 convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

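// Transposes an 8x8 tile of interleaved byte pairs, de-interleaving as it
// goes: the low half of each final store lands in dst_a (U) and the high
// half in dst_b (V). With only eight XMM registers available on 32-bit
// x86, one register is spilled to an aligned scratch slot carved out of
// the stack.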
#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w

    align      16
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqa    xmm2, [eax]
    movdqa    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqa    xmm4, [eax]
    movdqa    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqa    xmm6, [eax]
    movdqa    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqa    xmm5, [esp]  // restore xmm5
    movdqa    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5    // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqa    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
#define HAS_TRANSPOSE_WX8_SSSE3
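// GCC/Clang inline-asm port of the same 8x8 transpose; operands %0-%4 map
// to src, dst, width, src_stride and dst_stride respectively.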
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align  4                                 \n"
  "1:                                            \n"
    "movq       (%0),%%xmm0                      \n"
    "movq       (%0,%3),%%xmm1                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm1,%%xmm0                    \n"
    "movq       (%0),%%xmm2                      \n"
    "movdqa     %%xmm0,%%xmm1                    \n"
    "palignr    $0x8,%%xmm1,%%xmm1               \n"
    "movq       (%0,%3),%%xmm3                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm3,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm3                    \n"
    "movq       (%0),%%xmm4                      \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "movq       (%0,%3),%%xmm5                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm5,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm5                    \n"
    "movq       (%0),%%xmm6                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       (%0,%3),%%xmm7                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm7,%%xmm6                    \n"
    "neg        %3                               \n"
    "movdqa     %%xmm6,%%xmm7                    \n"
    "lea        0x8(%0,%3,8),%0                  \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "neg        %3                               \n"
     // Second round of bit swap.
    "punpcklwd  %%xmm2,%%xmm0                    \n"
    "punpcklwd  %%xmm3,%%xmm1                    \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    "palignr    $0x8,%%xmm2,%%xmm2               \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "punpcklwd  %%xmm6,%%xmm4                    \n"
    "punpcklwd  %%xmm7,%%xmm5                    \n"
    "movdqa     %%xmm4,%%xmm6                    \n"
    "movdqa     %%xmm5,%%xmm7                    \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq  %%xmm4,%%xmm0                    \n"
    "movq       %%xmm0,(%1)                      \n"
    "movdqa     %%xmm0,%%xmm4                    \n"
    "palignr    $0x8,%%xmm4,%%xmm4               \n"
    "movq       %%xmm4,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm6,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm6                    \n"
    "movq       %%xmm2,(%1)                      \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "punpckldq  %%xmm5,%%xmm1                    \n"
    "movq       %%xmm6,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "movdqa     %%xmm1,%%xmm5                    \n"
    "movq       %%xmm1,(%1)                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       %%xmm5,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm7,%%xmm3                    \n"
    "movq       %%xmm3,(%1)                      \n"
    "movdqa     %%xmm3,%%xmm7                    \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "sub        $0x8,%2                          \n"
    "movq       %%xmm7,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "jg         1b                               \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"(static_cast<intptr_t>(src_stride)),  // %3
      "r"(static_cast<intptr_t>(dst_stride))   // %4
    : "memory", "cc"
  #if defined(__SSE2__)
      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  #endif
  );
}

#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
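// Written as file-scope asm with its own prologue: 32-bit x86 has only
// eight XMM registers, so the routine saves the callee-saved GPRs itself
// and reserves a 16-byte-aligned scratch slot on the stack for spilling.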
374extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
375                                    uint8* dst_a, int dst_stride_a,
376                                    uint8* dst_b, int dst_stride_b,
377                                    int w);
378  asm (
379    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
380    "push   %ebx                               \n"
381    "push   %esi                               \n"
382    "push   %edi                               \n"
383    "push   %ebp                               \n"
384    "mov    0x14(%esp),%eax                    \n"
385    "mov    0x18(%esp),%edi                    \n"
386    "mov    0x1c(%esp),%edx                    \n"
387    "mov    0x20(%esp),%esi                    \n"
388    "mov    0x24(%esp),%ebx                    \n"
389    "mov    0x28(%esp),%ebp                    \n"
390    "mov    %esp,%ecx                          \n"
391    "sub    $0x14,%esp                         \n"
392    "and    $0xfffffff0,%esp                   \n"
393    "mov    %ecx,0x10(%esp)                    \n"
394    "mov    0x2c(%ecx),%ecx                    \n"
395
396"1:                                            \n"
397    "movdqa (%eax),%xmm0                       \n"
398    "movdqa (%eax,%edi,1),%xmm1                \n"
399    "lea    (%eax,%edi,2),%eax                 \n"
400    "movdqa %xmm0,%xmm7                        \n"
401    "punpcklbw %xmm1,%xmm0                     \n"
402    "punpckhbw %xmm1,%xmm7                     \n"
403    "movdqa %xmm7,%xmm1                        \n"
404    "movdqa (%eax),%xmm2                       \n"
405    "movdqa (%eax,%edi,1),%xmm3                \n"
406    "lea    (%eax,%edi,2),%eax                 \n"
407    "movdqa %xmm2,%xmm7                        \n"
408    "punpcklbw %xmm3,%xmm2                     \n"
409    "punpckhbw %xmm3,%xmm7                     \n"
410    "movdqa %xmm7,%xmm3                        \n"
411    "movdqa (%eax),%xmm4                       \n"
412    "movdqa (%eax,%edi,1),%xmm5                \n"
413    "lea    (%eax,%edi,2),%eax                 \n"
414    "movdqa %xmm4,%xmm7                        \n"
415    "punpcklbw %xmm5,%xmm4                     \n"
416    "punpckhbw %xmm5,%xmm7                     \n"
417    "movdqa %xmm7,%xmm5                        \n"
418    "movdqa (%eax),%xmm6                       \n"
419    "movdqa (%eax,%edi,1),%xmm7                \n"
420    "lea    (%eax,%edi,2),%eax                 \n"
421    "movdqa %xmm5,(%esp)                       \n"
422    "neg    %edi                               \n"
423    "movdqa %xmm6,%xmm5                        \n"
424    "punpcklbw %xmm7,%xmm6                     \n"
425    "punpckhbw %xmm7,%xmm5                     \n"
426    "movdqa %xmm5,%xmm7                        \n"
427    "lea    0x10(%eax,%edi,8),%eax             \n"
428    "neg    %edi                               \n"
429    "movdqa %xmm0,%xmm5                        \n"
430    "punpcklwd %xmm2,%xmm0                     \n"
431    "punpckhwd %xmm2,%xmm5                     \n"
432    "movdqa %xmm5,%xmm2                        \n"
433    "movdqa %xmm1,%xmm5                        \n"
434    "punpcklwd %xmm3,%xmm1                     \n"
435    "punpckhwd %xmm3,%xmm5                     \n"
436    "movdqa %xmm5,%xmm3                        \n"
437    "movdqa %xmm4,%xmm5                        \n"
438    "punpcklwd %xmm6,%xmm4                     \n"
439    "punpckhwd %xmm6,%xmm5                     \n"
440    "movdqa %xmm5,%xmm6                        \n"
441    "movdqa (%esp),%xmm5                       \n"
442    "movdqa %xmm6,(%esp)                       \n"
443    "movdqa %xmm5,%xmm6                        \n"
444    "punpcklwd %xmm7,%xmm5                     \n"
445    "punpckhwd %xmm7,%xmm6                     \n"
446    "movdqa %xmm6,%xmm7                        \n"
447    "movdqa %xmm0,%xmm6                        \n"
448    "punpckldq %xmm4,%xmm0                     \n"
449    "punpckhdq %xmm4,%xmm6                     \n"
450    "movdqa %xmm6,%xmm4                        \n"
451    "movdqa (%esp),%xmm6                       \n"
452    "movlpd %xmm0,(%edx)                       \n"
453    "movhpd %xmm0,(%ebx)                       \n"
454    "movlpd %xmm4,(%edx,%esi,1)                \n"
455    "lea    (%edx,%esi,2),%edx                 \n"
456    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
457    "lea    (%ebx,%ebp,2),%ebx                 \n"
458    "movdqa %xmm2,%xmm0                        \n"
459    "punpckldq %xmm6,%xmm2                     \n"
460    "movlpd %xmm2,(%edx)                       \n"
461    "movhpd %xmm2,(%ebx)                       \n"
462    "punpckhdq %xmm6,%xmm0                     \n"
463    "movlpd %xmm0,(%edx,%esi,1)                \n"
464    "lea    (%edx,%esi,2),%edx                 \n"
465    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
466    "lea    (%ebx,%ebp,2),%ebx                 \n"
467    "movdqa %xmm1,%xmm0                        \n"
468    "punpckldq %xmm5,%xmm1                     \n"
469    "movlpd %xmm1,(%edx)                       \n"
470    "movhpd %xmm1,(%ebx)                       \n"
471    "punpckhdq %xmm5,%xmm0                     \n"
472    "movlpd %xmm0,(%edx,%esi,1)                \n"
473    "lea    (%edx,%esi,2),%edx                 \n"
474    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
475    "lea    (%ebx,%ebp,2),%ebx                 \n"
476    "movdqa %xmm3,%xmm0                        \n"
477    "punpckldq %xmm7,%xmm3                     \n"
478    "movlpd %xmm3,(%edx)                       \n"
479    "movhpd %xmm3,(%ebx)                       \n"
480    "punpckhdq %xmm7,%xmm0                     \n"
481    "sub    $0x8,%ecx                          \n"
482    "movlpd %xmm0,(%edx,%esi,1)                \n"
483    "lea    (%edx,%esi,2),%edx                 \n"
484    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
485    "lea    (%ebx,%ebp,2),%ebx                 \n"
486    "jg     1b                                 \n"
487    "mov    0x10(%esp),%esp                    \n"
488    "pop    %ebp                               \n"
489    "pop    %edi                               \n"
490    "pop    %esi                               \n"
491    "pop    %ebx                               \n"
492    "ret                                       \n"
493);
494#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
495// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
496#define HAS_TRANSPOSE_WX8_FAST_SSSE3
497static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
498                                    uint8* dst, int dst_stride, int width) {
499  asm volatile (
500  // Read in the data from the source pointer.
501  // First round of bit swap.
502  ".p2align  4                                 \n"
503"1:                                            \n"
504  "movdqa     (%0),%%xmm0                      \n"
505  "movdqa     (%0,%3),%%xmm1                   \n"
506  "lea        (%0,%3,2),%0                     \n"
507  "movdqa     %%xmm0,%%xmm8                    \n"
508  "punpcklbw  %%xmm1,%%xmm0                    \n"
509  "punpckhbw  %%xmm1,%%xmm8                    \n"
510  "movdqa     (%0),%%xmm2                      \n"
511  "movdqa     %%xmm0,%%xmm1                    \n"
512  "movdqa     %%xmm8,%%xmm9                    \n"
513  "palignr    $0x8,%%xmm1,%%xmm1               \n"
514  "palignr    $0x8,%%xmm9,%%xmm9               \n"
515  "movdqa     (%0,%3),%%xmm3                   \n"
516  "lea        (%0,%3,2),%0                     \n"
517  "movdqa     %%xmm2,%%xmm10                   \n"
518  "punpcklbw  %%xmm3,%%xmm2                    \n"
519  "punpckhbw  %%xmm3,%%xmm10                   \n"
520  "movdqa     %%xmm2,%%xmm3                    \n"
521  "movdqa     %%xmm10,%%xmm11                  \n"
522  "movdqa     (%0),%%xmm4                      \n"
523  "palignr    $0x8,%%xmm3,%%xmm3               \n"
524  "palignr    $0x8,%%xmm11,%%xmm11             \n"
525  "movdqa     (%0,%3),%%xmm5                   \n"
526  "lea        (%0,%3,2),%0                     \n"
527  "movdqa     %%xmm4,%%xmm12                   \n"
528  "punpcklbw  %%xmm5,%%xmm4                    \n"
529  "punpckhbw  %%xmm5,%%xmm12                   \n"
530  "movdqa     %%xmm4,%%xmm5                    \n"
531  "movdqa     %%xmm12,%%xmm13                  \n"
532  "movdqa     (%0),%%xmm6                      \n"
533  "palignr    $0x8,%%xmm5,%%xmm5               \n"
534  "palignr    $0x8,%%xmm13,%%xmm13             \n"
535  "movdqa     (%0,%3),%%xmm7                   \n"
536  "lea        (%0,%3,2),%0                     \n"
537  "movdqa     %%xmm6,%%xmm14                   \n"
538  "punpcklbw  %%xmm7,%%xmm6                    \n"
539  "punpckhbw  %%xmm7,%%xmm14                   \n"
540  "neg        %3                               \n"
541  "movdqa     %%xmm6,%%xmm7                    \n"
542  "movdqa     %%xmm14,%%xmm15                  \n"
543  "lea        0x10(%0,%3,8),%0                 \n"
544  "palignr    $0x8,%%xmm7,%%xmm7               \n"
545  "palignr    $0x8,%%xmm15,%%xmm15             \n"
546  "neg        %3                               \n"
547   // Second round of bit swap.
548  "punpcklwd  %%xmm2,%%xmm0                    \n"
549  "punpcklwd  %%xmm3,%%xmm1                    \n"
550  "movdqa     %%xmm0,%%xmm2                    \n"
551  "movdqa     %%xmm1,%%xmm3                    \n"
552  "palignr    $0x8,%%xmm2,%%xmm2               \n"
553  "palignr    $0x8,%%xmm3,%%xmm3               \n"
554  "punpcklwd  %%xmm6,%%xmm4                    \n"
555  "punpcklwd  %%xmm7,%%xmm5                    \n"
556  "movdqa     %%xmm4,%%xmm6                    \n"
557  "movdqa     %%xmm5,%%xmm7                    \n"
558  "palignr    $0x8,%%xmm6,%%xmm6               \n"
559  "palignr    $0x8,%%xmm7,%%xmm7               \n"
560  "punpcklwd  %%xmm10,%%xmm8                   \n"
561  "punpcklwd  %%xmm11,%%xmm9                   \n"
562  "movdqa     %%xmm8,%%xmm10                   \n"
563  "movdqa     %%xmm9,%%xmm11                   \n"
564  "palignr    $0x8,%%xmm10,%%xmm10             \n"
565  "palignr    $0x8,%%xmm11,%%xmm11             \n"
566  "punpcklwd  %%xmm14,%%xmm12                  \n"
567  "punpcklwd  %%xmm15,%%xmm13                  \n"
568  "movdqa     %%xmm12,%%xmm14                  \n"
569  "movdqa     %%xmm13,%%xmm15                  \n"
570  "palignr    $0x8,%%xmm14,%%xmm14             \n"
571  "palignr    $0x8,%%xmm15,%%xmm15             \n"
572  // Third round of bit swap.
573  // Write to the destination pointer.
574  "punpckldq  %%xmm4,%%xmm0                    \n"
575  "movq       %%xmm0,(%1)                      \n"
576  "movdqa     %%xmm0,%%xmm4                    \n"
577  "palignr    $0x8,%%xmm4,%%xmm4               \n"
578  "movq       %%xmm4,(%1,%4)                   \n"
579  "lea        (%1,%4,2),%1                     \n"
580  "punpckldq  %%xmm6,%%xmm2                    \n"
581  "movdqa     %%xmm2,%%xmm6                    \n"
582  "movq       %%xmm2,(%1)                      \n"
583  "palignr    $0x8,%%xmm6,%%xmm6               \n"
584  "punpckldq  %%xmm5,%%xmm1                    \n"
585  "movq       %%xmm6,(%1,%4)                   \n"
586  "lea        (%1,%4,2),%1                     \n"
587  "movdqa     %%xmm1,%%xmm5                    \n"
588  "movq       %%xmm1,(%1)                      \n"
589  "palignr    $0x8,%%xmm5,%%xmm5               \n"
590  "movq       %%xmm5,(%1,%4)                   \n"
591  "lea        (%1,%4,2),%1                     \n"
592  "punpckldq  %%xmm7,%%xmm3                    \n"
593  "movq       %%xmm3,(%1)                      \n"
594  "movdqa     %%xmm3,%%xmm7                    \n"
595  "palignr    $0x8,%%xmm7,%%xmm7               \n"
596  "movq       %%xmm7,(%1,%4)                   \n"
597  "lea        (%1,%4,2),%1                     \n"
598  "punpckldq  %%xmm12,%%xmm8                   \n"
599  "movq       %%xmm8,(%1)                      \n"
600  "movdqa     %%xmm8,%%xmm12                   \n"
601  "palignr    $0x8,%%xmm12,%%xmm12             \n"
602  "movq       %%xmm12,(%1,%4)                  \n"
603  "lea        (%1,%4,2),%1                     \n"
604  "punpckldq  %%xmm14,%%xmm10                  \n"
605  "movdqa     %%xmm10,%%xmm14                  \n"
606  "movq       %%xmm10,(%1)                     \n"
607  "palignr    $0x8,%%xmm14,%%xmm14             \n"
608  "punpckldq  %%xmm13,%%xmm9                   \n"
609  "movq       %%xmm14,(%1,%4)                  \n"
610  "lea        (%1,%4,2),%1                     \n"
611  "movdqa     %%xmm9,%%xmm13                   \n"
612  "movq       %%xmm9,(%1)                      \n"
613  "palignr    $0x8,%%xmm13,%%xmm13             \n"
614  "movq       %%xmm13,(%1,%4)                  \n"
615  "lea        (%1,%4,2),%1                     \n"
616  "punpckldq  %%xmm15,%%xmm11                  \n"
617  "movq       %%xmm11,(%1)                     \n"
618  "movdqa     %%xmm11,%%xmm15                  \n"
619  "palignr    $0x8,%%xmm15,%%xmm15             \n"
620  "sub        $0x10,%2                         \n"
621  "movq       %%xmm15,(%1,%4)                  \n"
622  "lea        (%1,%4,2),%1                     \n"
623  "jg         1b                               \n"
624  : "+r"(src),    // %0
625    "+r"(dst),    // %1
626    "+r"(width)   // %2
627  : "r"(static_cast<intptr_t>(src_stride)),  // %3
628    "r"(static_cast<intptr_t>(dst_stride))   // %4
629  : "memory", "cc",
630    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
631    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
632);
633}
634
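// x86-64 variant of the UV transpose: with xmm8/xmm9 available as scratch
// there is no need to spill to the stack. Each movlpd/movhpd pair writes
// the low eight bytes to dst_a and the high eight bytes to dst_b.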
#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile (
  // Read in the data from the source pointer.
  // First round of bit swap.
  ".p2align  4                                 \n"
"1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
  "movdqa     (%0,%4),%%xmm1                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm0,%%xmm8                    \n"
  "punpcklbw  %%xmm1,%%xmm0                    \n"
  "punpckhbw  %%xmm1,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm1                    \n"
  "movdqa     (%0),%%xmm2                      \n"
  "movdqa     (%0,%4),%%xmm3                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm2,%%xmm8                    \n"
  "punpcklbw  %%xmm3,%%xmm2                    \n"
  "punpckhbw  %%xmm3,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm3                    \n"
  "movdqa     (%0),%%xmm4                      \n"
  "movdqa     (%0,%4),%%xmm5                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm4,%%xmm8                    \n"
  "punpcklbw  %%xmm5,%%xmm4                    \n"
  "punpckhbw  %%xmm5,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm5                    \n"
  "movdqa     (%0),%%xmm6                      \n"
  "movdqa     (%0,%4),%%xmm7                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm6,%%xmm8                    \n"
  "punpcklbw  %%xmm7,%%xmm6                    \n"
  "neg        %4                               \n"
  "lea        0x10(%0,%4,8),%0                 \n"
  "punpckhbw  %%xmm7,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm7                    \n"
  "neg        %4                               \n"
   // Second round of bit swap.
  "movdqa     %%xmm0,%%xmm8                    \n"
  "movdqa     %%xmm1,%%xmm9                    \n"
  "punpckhwd  %%xmm2,%%xmm8                    \n"
  "punpckhwd  %%xmm3,%%xmm9                    \n"
  "punpcklwd  %%xmm2,%%xmm0                    \n"
  "punpcklwd  %%xmm3,%%xmm1                    \n"
  "movdqa     %%xmm8,%%xmm2                    \n"
  "movdqa     %%xmm9,%%xmm3                    \n"
  "movdqa     %%xmm4,%%xmm8                    \n"
  "movdqa     %%xmm5,%%xmm9                    \n"
  "punpckhwd  %%xmm6,%%xmm8                    \n"
  "punpckhwd  %%xmm7,%%xmm9                    \n"
  "punpcklwd  %%xmm6,%%xmm4                    \n"
  "punpcklwd  %%xmm7,%%xmm5                    \n"
  "movdqa     %%xmm8,%%xmm6                    \n"
  "movdqa     %%xmm9,%%xmm7                    \n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "movdqa     %%xmm0,%%xmm8                    \n"
  "punpckldq  %%xmm4,%%xmm0                    \n"
  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
  "punpckhdq  %%xmm4,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm2,%%xmm8                    \n"
  "punpckldq  %%xmm6,%%xmm2                    \n"
  "movlpd     %%xmm2,(%1)                      \n"
  "movhpd     %%xmm2,(%2)                      \n"
  "punpckhdq  %%xmm6,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm1,%%xmm8                    \n"
  "punpckldq  %%xmm5,%%xmm1                    \n"
  "movlpd     %%xmm1,(%1)                      \n"
  "movhpd     %%xmm1,(%2)                      \n"
  "punpckhdq  %%xmm5,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm3,%%xmm8                    \n"
  "punpckldq  %%xmm7,%%xmm3                    \n"
  "movlpd     %%xmm3,(%1)                      \n"
  "movhpd     %%xmm3,(%2)                      \n"
  "punpckhdq  %%xmm7,%%xmm8                    \n"
  "sub        $0x8,%3                          \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(w)   // %3
  : "r"(static_cast<intptr_t>(src_stride)),    // %4
    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9"
);
}
#endif
#endif

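// Portable C fallback: transposes one strip 8 rows tall and 'width' columns
// wide, writing each source column as a destination row.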
static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width) {
  for (int i = 0; i < width; ++i) {
    dst[0] = src[0 * src_stride];
    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
    dst[3] = src[3 * src_stride];
    dst[4] = src[4 * src_stride];
    dst[5] = src[5 * src_stride];
    dst[6] = src[6 * src_stride];
    dst[7] = src[7 * src_stride];
    ++src;
    dst += dst_stride;
  }
}

static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < height; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}

LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  void (*TransposeWx8)(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeWx8 = TransposeWx8_NEON;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    TransposeWx8 = TransposeWx8_SSSE3;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeWx8 = TransposeWx8_FAST_SSSE3;
  }
#endif

  // Work across the source in 8x8 tiles.
  int i = height;
  while (i >= 8) {
    TransposeWx8(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;    // Go down 8 rows.
    dst += 8;                 // Move over 8 columns.
    i -= 8;
  }

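  // Transpose any leftover rows (height % 8) with the generic C routine.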
  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}

LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
                   uint8* dst, int dst_stride,
                   int width, int height) {
  // Rotate by 90 is a transpose with the source read
  // from bottom to top. So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
  src += src_stride * (height - 1);
  src_stride = -src_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top. So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
#if defined(HAS_MIRRORROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorRow = MirrorRow_NEON;
  }
#endif
#if defined(HAS_MIRRORROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSE2;
  }
#endif
#if defined(HAS_MIRRORROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSSE3;
  }
#endif
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
    CopyRow = CopyRow_NEON;
  }
#endif
#if defined(HAS_COPYROW_X86)
  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
    CopyRow = CopyRow_X86;
  }
#endif
#if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    CopyRow = CopyRow_SSE2;
  }
#endif
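  // The temporary row below is a fixed-size stack buffer, so rows wider
  // than kMaxStride cannot be handled here.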
  if (width > kMaxStride) {
    return;
  }
  // Swap first and last row and mirror the content. Uses a temporary row.
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  const uint8* src_bot = src + src_stride * (height - 1);
  uint8* dst_bot = dst + dst_stride * (height - 1);
  int half_height = (height + 1) >> 1;
  // Odd height will harmlessly mirror the middle row twice.
  for (int y = 0; y < half_height; ++y) {
    MirrorRow(src, row, width);  // Mirror first row into a buffer
    src += src_stride;
    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
    dst += dst_stride;
    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
}

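// C fallback for the interleaved UV transpose: for each byte-pair column it
// writes one row of 8 first-channel bytes to dst_a and 8 second-channel
// bytes to dst_b.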
static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width) {
  for (int i = 0; i < width; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}

static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height) {
  for (int i = 0; i < width * 2; i += 2)
    for (int j = 0; j < height; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }
}

LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  void (*TransposeUVWx8)(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeUVWx8 = TransposeUVWx8_NEON;
  }
#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 8) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeUVWx8 = TransposeUVWx8_SSE2;
  }
#endif

  // Work through the source in 8x8 tiles.
  int i = height;
  while (i >= 8) {
    TransposeUVWx8(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width);
    src += 8 * src_stride;    // Go down 8 rows.
    dst_a += 8;               // Move over 8 columns.
    dst_b += 8;               // Move over 8 columns.
    i -= 8;
  }

  TransposeUVWxH_C(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width, i);
}

LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
                uint8* dst_a, int dst_stride_a,
                uint8* dst_b, int dst_stride_b,
                int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  dst_a += dst_stride_a * (width - 1);
  dst_b += dst_stride_b * (width - 1);
  dst_stride_a = -dst_stride_a;
  dst_stride_b = -dst_stride_b;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
      MirrorRowUV_C;
#if defined(HAS_MIRRORROW_UV_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    MirrorRowUV = MirrorRowUV_NEON;
  }
#elif defined(HAS_MIRRORROW_UV_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    MirrorRowUV = MirrorRowUV_SSSE3;
  }
#endif

  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);

  for (int i = 0; i < height; ++i) {
    MirrorRowUV(src, dst_a, dst_b, width);
    src += src_stride;
    dst_a -= dst_stride_a;
    dst_b -= dst_stride_b;
  }
}

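// Note: for kRotate90 and kRotate270 the destination planes have swapped
// dimensions (height x width for Y, halfheight x halfwidth for U and V),
// so the caller must size the destination buffers and strides for the
// rotated geometry.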
LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               RotationMode mode) {
  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     RotationMode mode) {
  if (!src_y || !src_uv || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_stride_y,
                        src_uv, src_stride_uv,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif