/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/rotate.h"

#include "libyuv/cpu_id.h"
#include "libyuv/convert.h"
#include "libyuv/planar_functions.h"
#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".private_extern _" #name "                \n"                             \
    ".align 4,0x90                             \n"                             \
"_" #name ":                                   \n"
#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".align 4,0x90                             \n"                             \
"_" #name ":                                   \n"
#else
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".align 4,0x90                             \n"                             \
#name ":                                       \n"
#endif
#endif
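
// DECLARE_FUNCTION emits the assembler boilerplate (.text section, alignment
// padding and the function label) used by the file-scope asm block further
// down. The Apple i386 variant also adds the leading underscore and
// private_extern visibility that Mach-O symbol naming expects.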

#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_MIRRORUVROW_NEON
void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
#endif  // defined(__ARM_NEON__)

#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
    defined(__mips__) && \
    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width);

void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
                               uint8* dst_a, int dst_stride_a,
                               uint8* dst_b, int dst_stride_b,
                               int width);
#endif  // defined(__mips__)

#if !defined(LIBYUV_DISABLE_X86) && \
    defined(_M_IX86) && defined(_MSC_VER)
#define HAS_TRANSPOSE_WX8_SSSE3
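// The SIMD kernels below transpose an 8x8 (or 16x8) block of bytes in three
// rounds of interleaves: bytes (punpcklbw), 16-bit words (punpcklwd), then
// 32-bit dwords (punpckldq). After the third round each 64-bit register half
// holds one fully transposed output row, which is stored with movq/movlpd.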
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

#define HAS_TRANSPOSE_UVWX8_SSE2
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqa    xmm2, [eax]
    movdqa    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqa    xmm4, [eax]
    movdqa    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqa    xmm6, [eax]
    movdqa    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqa    xmm5, [esp]  // restore xmm5
    movdqa    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5    // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqa    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && \
    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align  2                                 \n"
  "1:                                            \n"
    "movq       (%0),%%xmm0                      \n"
    "movq       (%0,%3),%%xmm1                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm1,%%xmm0                    \n"
    "movq       (%0),%%xmm2                      \n"
    "movdqa     %%xmm0,%%xmm1                    \n"
    "palignr    $0x8,%%xmm1,%%xmm1               \n"
    "movq       (%0,%3),%%xmm3                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm3,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm3                    \n"
    "movq       (%0),%%xmm4                      \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "movq       (%0,%3),%%xmm5                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm5,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm5                    \n"
    "movq       (%0),%%xmm6                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       (%0,%3),%%xmm7                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm7,%%xmm6                    \n"
    "neg        %3                               \n"
    "movdqa     %%xmm6,%%xmm7                    \n"
    "lea        0x8(%0,%3,8),%0                  \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "neg        %3                               \n"
     // Second round of bit swap.
    "punpcklwd  %%xmm2,%%xmm0                    \n"
    "punpcklwd  %%xmm3,%%xmm1                    \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    "palignr    $0x8,%%xmm2,%%xmm2               \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "punpcklwd  %%xmm6,%%xmm4                    \n"
    "punpcklwd  %%xmm7,%%xmm5                    \n"
    "movdqa     %%xmm4,%%xmm6                    \n"
    "movdqa     %%xmm5,%%xmm7                    \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq  %%xmm4,%%xmm0                    \n"
    "movq       %%xmm0,(%1)                      \n"
    "movdqa     %%xmm0,%%xmm4                    \n"
    "palignr    $0x8,%%xmm4,%%xmm4               \n"
    "movq       %%xmm4,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm6,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm6                    \n"
    "movq       %%xmm2,(%1)                      \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "punpckldq  %%xmm5,%%xmm1                    \n"
    "movq       %%xmm6,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "movdqa     %%xmm1,%%xmm5                    \n"
    "movq       %%xmm1,(%1)                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       %%xmm5,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm7,%%xmm3                    \n"
    "movq       %%xmm3,(%1)                      \n"
    "movdqa     %%xmm3,%%xmm7                    \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "sub        $0x8,%2                          \n"
    "movq       %%xmm7,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "jg         1b                               \n"
    : "+r"(src),    // %0
      "+r"(dst),    // %1
      "+r"(width)   // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "r"((intptr_t)(dst_stride))   // %4
    : "memory", "cc"
  #if defined(__SSE2__)
      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  #endif
  );
}

#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w);
  asm (
    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
    "push   %ebx                               \n"
    "push   %esi                               \n"
    "push   %edi                               \n"
    "push   %ebp                               \n"
    "mov    0x14(%esp),%eax                    \n"
    "mov    0x18(%esp),%edi                    \n"
    "mov    0x1c(%esp),%edx                    \n"
    "mov    0x20(%esp),%esi                    \n"
    "mov    0x24(%esp),%ebx                    \n"
    "mov    0x28(%esp),%ebp                    \n"
    "mov    %esp,%ecx                          \n"
    "sub    $0x14,%esp                         \n"
    "and    $0xfffffff0,%esp                   \n"
    "mov    %ecx,0x10(%esp)                    \n"
    "mov    0x2c(%ecx),%ecx                    \n"

"1:                                            \n"
    "movdqa (%eax),%xmm0                       \n"
    "movdqa (%eax,%edi,1),%xmm1                \n"
    "lea    (%eax,%edi,2),%eax                 \n"
    "movdqa %xmm0,%xmm7                        \n"
    "punpcklbw %xmm1,%xmm0                     \n"
    "punpckhbw %xmm1,%xmm7                     \n"
    "movdqa %xmm7,%xmm1                        \n"
    "movdqa (%eax),%xmm2                       \n"
    "movdqa (%eax,%edi,1),%xmm3                \n"
    "lea    (%eax,%edi,2),%eax                 \n"
    "movdqa %xmm2,%xmm7                        \n"
    "punpcklbw %xmm3,%xmm2                     \n"
    "punpckhbw %xmm3,%xmm7                     \n"
    "movdqa %xmm7,%xmm3                        \n"
    "movdqa (%eax),%xmm4                       \n"
    "movdqa (%eax,%edi,1),%xmm5                \n"
    "lea    (%eax,%edi,2),%eax                 \n"
    "movdqa %xmm4,%xmm7                        \n"
    "punpcklbw %xmm5,%xmm4                     \n"
    "punpckhbw %xmm5,%xmm7                     \n"
    "movdqa %xmm7,%xmm5                        \n"
    "movdqa (%eax),%xmm6                       \n"
    "movdqa (%eax,%edi,1),%xmm7                \n"
    "lea    (%eax,%edi,2),%eax                 \n"
    "movdqa %xmm5,(%esp)                       \n"
    "neg    %edi                               \n"
    "movdqa %xmm6,%xmm5                        \n"
    "punpcklbw %xmm7,%xmm6                     \n"
    "punpckhbw %xmm7,%xmm5                     \n"
    "movdqa %xmm5,%xmm7                        \n"
    "lea    0x10(%eax,%edi,8),%eax             \n"
    "neg    %edi                               \n"
    "movdqa %xmm0,%xmm5                        \n"
    "punpcklwd %xmm2,%xmm0                     \n"
    "punpckhwd %xmm2,%xmm5                     \n"
    "movdqa %xmm5,%xmm2                        \n"
    "movdqa %xmm1,%xmm5                        \n"
    "punpcklwd %xmm3,%xmm1                     \n"
    "punpckhwd %xmm3,%xmm5                     \n"
    "movdqa %xmm5,%xmm3                        \n"
    "movdqa %xmm4,%xmm5                        \n"
    "punpcklwd %xmm6,%xmm4                     \n"
    "punpckhwd %xmm6,%xmm5                     \n"
    "movdqa %xmm5,%xmm6                        \n"
    "movdqa (%esp),%xmm5                       \n"
    "movdqa %xmm6,(%esp)                       \n"
    "movdqa %xmm5,%xmm6                        \n"
    "punpcklwd %xmm7,%xmm5                     \n"
    "punpckhwd %xmm7,%xmm6                     \n"
    "movdqa %xmm6,%xmm7                        \n"
    "movdqa %xmm0,%xmm6                        \n"
    "punpckldq %xmm4,%xmm0                     \n"
    "punpckhdq %xmm4,%xmm6                     \n"
    "movdqa %xmm6,%xmm4                        \n"
    "movdqa (%esp),%xmm6                       \n"
    "movlpd %xmm0,(%edx)                       \n"
    "movhpd %xmm0,(%ebx)                       \n"
    "movlpd %xmm4,(%edx,%esi,1)                \n"
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
    "movdqa %xmm2,%xmm0                        \n"
    "punpckldq %xmm6,%xmm2                     \n"
    "movlpd %xmm2,(%edx)                       \n"
    "movhpd %xmm2,(%ebx)                       \n"
    "punpckhdq %xmm6,%xmm0                     \n"
    "movlpd %xmm0,(%edx,%esi,1)                \n"
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
    "movdqa %xmm1,%xmm0                        \n"
    "punpckldq %xmm5,%xmm1                     \n"
    "movlpd %xmm1,(%edx)                       \n"
    "movhpd %xmm1,(%ebx)                       \n"
    "punpckhdq %xmm5,%xmm0                     \n"
    "movlpd %xmm0,(%edx,%esi,1)                \n"
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
    "movdqa %xmm3,%xmm0                        \n"
    "punpckldq %xmm7,%xmm3                     \n"
    "movlpd %xmm3,(%edx)                       \n"
    "movhpd %xmm3,(%ebx)                       \n"
    "punpckhdq %xmm7,%xmm0                     \n"
    "sub    $0x8,%ecx                          \n"
    "movlpd %xmm0,(%edx,%esi,1)                \n"
    "lea    (%edx,%esi,2),%edx                 \n"
    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
    "lea    (%ebx,%ebp,2),%ebx                 \n"
    "jg     1b                                 \n"
    "mov    0x10(%esp),%esp                    \n"
    "pop    %ebp                               \n"
    "pop    %edi                               \n"
    "pop    %esi                               \n"
    "pop    %ebx                               \n"
#if defined(__native_client__)
    "pop    %ecx                               \n"
    "and    $0xffffffe0,%ecx                   \n"
    "jmp    *%ecx                              \n"
#else
    "ret                                       \n"
#endif
);
#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
    defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  asm volatile (
  // Read in the data from the source pointer.
  // First round of bit swap.
  ".p2align  2                                 \n"
"1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
  "movdqa     (%0,%3),%%xmm1                   \n"
  "lea        (%0,%3,2),%0                     \n"
  "movdqa     %%xmm0,%%xmm8                    \n"
  "punpcklbw  %%xmm1,%%xmm0                    \n"
  "punpckhbw  %%xmm1,%%xmm8                    \n"
  "movdqa     (%0),%%xmm2                      \n"
  "movdqa     %%xmm0,%%xmm1                    \n"
  "movdqa     %%xmm8,%%xmm9                    \n"
  "palignr    $0x8,%%xmm1,%%xmm1               \n"
  "palignr    $0x8,%%xmm9,%%xmm9               \n"
  "movdqa     (%0,%3),%%xmm3                   \n"
  "lea        (%0,%3,2),%0                     \n"
  "movdqa     %%xmm2,%%xmm10                   \n"
  "punpcklbw  %%xmm3,%%xmm2                    \n"
  "punpckhbw  %%xmm3,%%xmm10                   \n"
  "movdqa     %%xmm2,%%xmm3                    \n"
  "movdqa     %%xmm10,%%xmm11                  \n"
  "movdqa     (%0),%%xmm4                      \n"
  "palignr    $0x8,%%xmm3,%%xmm3               \n"
  "palignr    $0x8,%%xmm11,%%xmm11             \n"
  "movdqa     (%0,%3),%%xmm5                   \n"
  "lea        (%0,%3,2),%0                     \n"
  "movdqa     %%xmm4,%%xmm12                   \n"
  "punpcklbw  %%xmm5,%%xmm4                    \n"
  "punpckhbw  %%xmm5,%%xmm12                   \n"
  "movdqa     %%xmm4,%%xmm5                    \n"
  "movdqa     %%xmm12,%%xmm13                  \n"
  "movdqa     (%0),%%xmm6                      \n"
  "palignr    $0x8,%%xmm5,%%xmm5               \n"
  "palignr    $0x8,%%xmm13,%%xmm13             \n"
  "movdqa     (%0,%3),%%xmm7                   \n"
  "lea        (%0,%3,2),%0                     \n"
  "movdqa     %%xmm6,%%xmm14                   \n"
  "punpcklbw  %%xmm7,%%xmm6                    \n"
  "punpckhbw  %%xmm7,%%xmm14                   \n"
  "neg        %3                               \n"
  "movdqa     %%xmm6,%%xmm7                    \n"
  "movdqa     %%xmm14,%%xmm15                  \n"
  "lea        0x10(%0,%3,8),%0                 \n"
  "palignr    $0x8,%%xmm7,%%xmm7               \n"
  "palignr    $0x8,%%xmm15,%%xmm15             \n"
  "neg        %3                               \n"
   // Second round of bit swap.
  "punpcklwd  %%xmm2,%%xmm0                    \n"
  "punpcklwd  %%xmm3,%%xmm1                    \n"
  "movdqa     %%xmm0,%%xmm2                    \n"
  "movdqa     %%xmm1,%%xmm3                    \n"
  "palignr    $0x8,%%xmm2,%%xmm2               \n"
  "palignr    $0x8,%%xmm3,%%xmm3               \n"
  "punpcklwd  %%xmm6,%%xmm4                    \n"
  "punpcklwd  %%xmm7,%%xmm5                    \n"
  "movdqa     %%xmm4,%%xmm6                    \n"
  "movdqa     %%xmm5,%%xmm7                    \n"
  "palignr    $0x8,%%xmm6,%%xmm6               \n"
  "palignr    $0x8,%%xmm7,%%xmm7               \n"
  "punpcklwd  %%xmm10,%%xmm8                   \n"
  "punpcklwd  %%xmm11,%%xmm9                   \n"
  "movdqa     %%xmm8,%%xmm10                   \n"
  "movdqa     %%xmm9,%%xmm11                   \n"
  "palignr    $0x8,%%xmm10,%%xmm10             \n"
  "palignr    $0x8,%%xmm11,%%xmm11             \n"
  "punpcklwd  %%xmm14,%%xmm12                  \n"
  "punpcklwd  %%xmm15,%%xmm13                  \n"
  "movdqa     %%xmm12,%%xmm14                  \n"
  "movdqa     %%xmm13,%%xmm15                  \n"
  "palignr    $0x8,%%xmm14,%%xmm14             \n"
  "palignr    $0x8,%%xmm15,%%xmm15             \n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "punpckldq  %%xmm4,%%xmm0                    \n"
  "movq       %%xmm0,(%1)                      \n"
  "movdqa     %%xmm0,%%xmm4                    \n"
  "palignr    $0x8,%%xmm4,%%xmm4               \n"
  "movq       %%xmm4,(%1,%4)                   \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm6,%%xmm2                    \n"
  "movdqa     %%xmm2,%%xmm6                    \n"
  "movq       %%xmm2,(%1)                      \n"
  "palignr    $0x8,%%xmm6,%%xmm6               \n"
  "punpckldq  %%xmm5,%%xmm1                    \n"
  "movq       %%xmm6,(%1,%4)                   \n"
  "lea        (%1,%4,2),%1                     \n"
  "movdqa     %%xmm1,%%xmm5                    \n"
  "movq       %%xmm1,(%1)                      \n"
  "palignr    $0x8,%%xmm5,%%xmm5               \n"
  "movq       %%xmm5,(%1,%4)                   \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm7,%%xmm3                    \n"
  "movq       %%xmm3,(%1)                      \n"
  "movdqa     %%xmm3,%%xmm7                    \n"
  "palignr    $0x8,%%xmm7,%%xmm7               \n"
  "movq       %%xmm7,(%1,%4)                   \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm12,%%xmm8                   \n"
  "movq       %%xmm8,(%1)                      \n"
  "movdqa     %%xmm8,%%xmm12                   \n"
  "palignr    $0x8,%%xmm12,%%xmm12             \n"
  "movq       %%xmm12,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm14,%%xmm10                  \n"
  "movdqa     %%xmm10,%%xmm14                  \n"
  "movq       %%xmm10,(%1)                     \n"
  "palignr    $0x8,%%xmm14,%%xmm14             \n"
  "punpckldq  %%xmm13,%%xmm9                   \n"
  "movq       %%xmm14,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
  "movdqa     %%xmm9,%%xmm13                   \n"
  "movq       %%xmm9,(%1)                      \n"
  "palignr    $0x8,%%xmm13,%%xmm13             \n"
  "movq       %%xmm13,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
  "punpckldq  %%xmm15,%%xmm11                  \n"
  "movq       %%xmm11,(%1)                     \n"
  "movdqa     %%xmm11,%%xmm15                  \n"
  "palignr    $0x8,%%xmm15,%%xmm15             \n"
  "sub        $0x10,%2                         \n"
  "movq       %%xmm15,(%1,%4)                  \n"
  "lea        (%1,%4,2),%1                     \n"
  "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(dst_stride))   // %4
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
);
}

#define HAS_TRANSPOSE_UVWX8_SSE2
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile (
  // Read in the data from the source pointer.
  // First round of bit swap.
  ".p2align  2                                 \n"
"1:                                            \n"
  "movdqa     (%0),%%xmm0                      \n"
  "movdqa     (%0,%4),%%xmm1                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm0,%%xmm8                    \n"
  "punpcklbw  %%xmm1,%%xmm0                    \n"
  "punpckhbw  %%xmm1,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm1                    \n"
  "movdqa     (%0),%%xmm2                      \n"
  "movdqa     (%0,%4),%%xmm3                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm2,%%xmm8                    \n"
  "punpcklbw  %%xmm3,%%xmm2                    \n"
  "punpckhbw  %%xmm3,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm3                    \n"
  "movdqa     (%0),%%xmm4                      \n"
  "movdqa     (%0,%4),%%xmm5                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm4,%%xmm8                    \n"
  "punpcklbw  %%xmm5,%%xmm4                    \n"
  "punpckhbw  %%xmm5,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm5                    \n"
  "movdqa     (%0),%%xmm6                      \n"
  "movdqa     (%0,%4),%%xmm7                   \n"
  "lea        (%0,%4,2),%0                     \n"
  "movdqa     %%xmm6,%%xmm8                    \n"
  "punpcklbw  %%xmm7,%%xmm6                    \n"
  "neg        %4                               \n"
  "lea        0x10(%0,%4,8),%0                 \n"
  "punpckhbw  %%xmm7,%%xmm8                    \n"
  "movdqa     %%xmm8,%%xmm7                    \n"
  "neg        %4                               \n"
   // Second round of bit swap.
  "movdqa     %%xmm0,%%xmm8                    \n"
  "movdqa     %%xmm1,%%xmm9                    \n"
  "punpckhwd  %%xmm2,%%xmm8                    \n"
  "punpckhwd  %%xmm3,%%xmm9                    \n"
  "punpcklwd  %%xmm2,%%xmm0                    \n"
  "punpcklwd  %%xmm3,%%xmm1                    \n"
  "movdqa     %%xmm8,%%xmm2                    \n"
  "movdqa     %%xmm9,%%xmm3                    \n"
  "movdqa     %%xmm4,%%xmm8                    \n"
  "movdqa     %%xmm5,%%xmm9                    \n"
  "punpckhwd  %%xmm6,%%xmm8                    \n"
  "punpckhwd  %%xmm7,%%xmm9                    \n"
  "punpcklwd  %%xmm6,%%xmm4                    \n"
  "punpcklwd  %%xmm7,%%xmm5                    \n"
  "movdqa     %%xmm8,%%xmm6                    \n"
  "movdqa     %%xmm9,%%xmm7                    \n"
  // Third round of bit swap.
  // Write to the destination pointer.
  "movdqa     %%xmm0,%%xmm8                    \n"
  "punpckldq  %%xmm4,%%xmm0                    \n"
  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
  "punpckhdq  %%xmm4,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm2,%%xmm8                    \n"
  "punpckldq  %%xmm6,%%xmm2                    \n"
  "movlpd     %%xmm2,(%1)                      \n"
  "movhpd     %%xmm2,(%2)                      \n"
  "punpckhdq  %%xmm6,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm1,%%xmm8                    \n"
  "punpckldq  %%xmm5,%%xmm1                    \n"
  "movlpd     %%xmm1,(%1)                      \n"
  "movhpd     %%xmm1,(%2)                      \n"
  "punpckhdq  %%xmm5,%%xmm8                    \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "movdqa     %%xmm3,%%xmm8                    \n"
  "punpckldq  %%xmm7,%%xmm3                    \n"
  "movlpd     %%xmm3,(%1)                      \n"
  "movhpd     %%xmm3,(%2)                      \n"
  "punpckhdq  %%xmm7,%%xmm8                    \n"
  "sub        $0x8,%3                          \n"
  "movlpd     %%xmm8,(%1,%5)                   \n"
  "lea        (%1,%5,2),%1                     \n"
  "movhpd     %%xmm8,(%2,%6)                   \n"
  "lea        (%2,%6,2),%2                     \n"
  "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(w)   // %3
  : "r"((intptr_t)(src_stride)),    // %4
    "r"((intptr_t)(dst_stride_a)),  // %5
    "r"((intptr_t)(dst_stride_b))   // %6
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9"
);
}
#endif
#endif

static void TransposeWx8_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[0] = src[0 * src_stride];
    dst[1] = src[1 * src_stride];
    dst[2] = src[2 * src_stride];
    dst[3] = src[3 * src_stride];
    dst[4] = src[4 * src_stride];
    dst[5] = src[5 * src_stride];
    dst[6] = src[6 * src_stride];
    dst[7] = src[7 * src_stride];
    ++src;
    dst += dst_stride;
  }
}
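
// Note: TransposeWx8_C copies one 8-pixel source column into one destination
// row per iteration, so an 8-row band of the source becomes an 8-column band
// of the destination. The SIMD versions above do the same work 8 (or 16)
// columns per loop iteration.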

static void TransposeWxH_C(const uint8* src, int src_stride,
                           uint8* dst, int dst_stride,
                           int width, int height) {
  int i;
  for (i = 0; i < width; ++i) {
    int j;
    for (j = 0; j < height; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}

LIBYUV_API
void TransposePlane(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  int i = height;
  void (*TransposeWx8)(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) = TransposeWx8_C;
#if defined(HAS_TRANSPOSE_WX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeWx8 = TransposeWx8_NEON;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
    TransposeWx8 = TransposeWx8_SSSE3;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeWx8 = TransposeWx8_FAST_SSSE3;
  }
#endif
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
    if (IS_ALIGNED(width, 4) &&
        IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
      TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
    } else {
      TransposeWx8 = TransposeWx8_MIPS_DSPR2;
    }
  }
#endif

  // Work across the source in 8x8 tiles.
  while (i >= 8) {
    TransposeWx8(src, src_stride, dst, dst_stride, width);
    src += 8 * src_stride;    // Go down 8 rows.
    dst += 8;                 // Move over 8 columns.
    i -= 8;
  }

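  // Transpose any remaining rows (fewer than 8) with the general C version.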
  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
}

LIBYUV_API
void RotatePlane90(const uint8* src, int src_stride,
                   uint8* dst, int dst_stride,
                   int width, int height) {
  // Rotate by 90 is a transpose with the source read
  // from bottom to top. So set the source pointer to the end
  // of the buffer and flip the sign of the source stride.
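  // In coordinates: dst(row, col) = src(height - 1 - col, row), i.e. a
  // clockwise rotation.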
  src += src_stride * (height - 1);
  src_stride = -src_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

LIBYUV_API
void RotatePlane270(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Rotate by 270 is a transpose with the destination written
  // from bottom to top. So set the destination pointer to the end
  // of the buffer and flip the sign of the destination stride.
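  // In coordinates: dst(row, col) = src(col, width - 1 - row), i.e. a
  // counter-clockwise rotation.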
  dst += dst_stride * (width - 1);
  dst_stride = -dst_stride;
  TransposePlane(src, src_stride, dst, dst_stride, width, height);
}

LIBYUV_API
void RotatePlane180(const uint8* src, int src_stride,
                    uint8* dst, int dst_stride,
                    int width, int height) {
  // Swap first and last row and mirror the content. Uses a temporary row.
  align_buffer_64(row, width);
  const uint8* src_bot = src + src_stride * (height - 1);
  uint8* dst_bot = dst + dst_stride * (height - 1);
  int half_height = (height + 1) >> 1;
  int y;
  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_MIRRORROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
    MirrorRow = MirrorRow_NEON;
  }
#endif
#if defined(HAS_MIRRORROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSE2;
  }
#endif
#if defined(HAS_MIRRORROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    MirrorRow = MirrorRow_SSSE3;
  }
#endif
#if defined(HAS_MIRRORROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
    MirrorRow = MirrorRow_AVX2;
  }
#endif
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
      IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
    MirrorRow = MirrorRow_MIPS_DSPR2;
  }
#endif
#if defined(HAS_COPYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
    CopyRow = CopyRow_NEON;
  }
#endif
#if defined(HAS_COPYROW_X86)
  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
    CopyRow = CopyRow_X86;
  }
#endif
#if defined(HAS_COPYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
    CopyRow = CopyRow_SSE2;
  }
#endif
#if defined(HAS_COPYROW_ERMS)
  if (TestCpuFlag(kCpuHasERMS)) {
    CopyRow = CopyRow_ERMS;
  }
#endif
#if defined(HAS_COPYROW_MIPS)
  if (TestCpuFlag(kCpuHasMIPS)) {
    CopyRow = CopyRow_MIPS;
  }
#endif

  // Odd height will harmlessly mirror the middle row twice.
  for (y = 0; y < half_height; ++y) {
    MirrorRow(src, row, width);  // Mirror first row into a buffer
    src += src_stride;
    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
    dst += dst_stride;
    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
    src_bot -= src_stride;
    dst_bot -= dst_stride;
  }
  free_aligned_buffer_64(row);
}

static void TransposeUVWx8_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_a[0] = src[0 * src_stride + 0];
    dst_b[0] = src[0 * src_stride + 1];
    dst_a[1] = src[1 * src_stride + 0];
    dst_b[1] = src[1 * src_stride + 1];
    dst_a[2] = src[2 * src_stride + 0];
    dst_b[2] = src[2 * src_stride + 1];
    dst_a[3] = src[3 * src_stride + 0];
    dst_b[3] = src[3 * src_stride + 1];
    dst_a[4] = src[4 * src_stride + 0];
    dst_b[4] = src[4 * src_stride + 1];
    dst_a[5] = src[5 * src_stride + 0];
    dst_b[5] = src[5 * src_stride + 1];
    dst_a[6] = src[6 * src_stride + 0];
    dst_b[6] = src[6 * src_stride + 1];
    dst_a[7] = src[7 * src_stride + 0];
    dst_b[7] = src[7 * src_stride + 1];
    src += 2;
    dst_a += dst_stride_a;
    dst_b += dst_stride_b;
  }
}
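
// Note: TransposeUVWx8_C de-interleaves while it transposes: the U bytes of
// 8 interleaved source rows become one dst_a row and the V bytes become the
// matching dst_b row.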

static void TransposeUVWxH_C(const uint8* src, int src_stride,
                             uint8* dst_a, int dst_stride_a,
                             uint8* dst_b, int dst_stride_b,
                             int width, int height) {
  int i;
  for (i = 0; i < width * 2; i += 2) {
    int j;
    for (j = 0; j < height; ++j) {
      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
    }
  }
}

LIBYUV_API
void TransposeUV(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i = height;
  void (*TransposeUVWx8)(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) = TransposeUVWx8_C;
#if defined(HAS_TRANSPOSE_UVWX8_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    TransposeUVWx8 = TransposeUVWx8_NEON;
  }
#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) &&
      IS_ALIGNED(width, 8) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    TransposeUVWx8 = TransposeUVWx8_SSE2;
  }
#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
    TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
  }
#endif

  // Work through the source in 8x8 tiles.
  while (i >= 8) {
    TransposeUVWx8(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width);
    src += 8 * src_stride;    // Go down 8 rows.
    dst_a += 8;               // Move over 8 columns.
    dst_b += 8;               // Move over 8 columns.
    i -= 8;
  }

  TransposeUVWxH_C(src, src_stride,
                   dst_a, dst_stride_a,
                   dst_b, dst_stride_b,
                   width, i);
}

LIBYUV_API
void RotateUV90(const uint8* src, int src_stride,
                uint8* dst_a, int dst_stride_a,
                uint8* dst_b, int dst_stride_b,
                int width, int height) {
  src += src_stride * (height - 1);
  src_stride = -src_stride;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

LIBYUV_API
void RotateUV270(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  dst_a += dst_stride_a * (width - 1);
  dst_b += dst_stride_b * (width - 1);
  dst_stride_a = -dst_stride_a;
  dst_stride_b = -dst_stride_b;

  TransposeUV(src, src_stride,
              dst_a, dst_stride_a,
              dst_b, dst_stride_b,
              width, height);
}

// Rotate 180 is a horizontal and vertical flip.
LIBYUV_API
void RotateUV180(const uint8* src, int src_stride,
                 uint8* dst_a, int dst_stride_a,
                 uint8* dst_b, int dst_stride_b,
                 int width, int height) {
  int i;
  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
      MirrorUVRow_C;
#if defined(HAS_MIRRORUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
    MirrorRowUV = MirrorUVRow_NEON;
  }
#elif defined(HAS_MIRRORROW_UV_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
    MirrorRowUV = MirrorUVRow_SSSE3;
  }
#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
      IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
    MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
  }
#endif

  dst_a += dst_stride_a * (height - 1);
  dst_b += dst_stride_b * (height - 1);

  for (i = 0; i < height; ++i) {
    MirrorRowUV(src, dst_a, dst_b, width);
    src += src_stride;
    dst_a -= dst_stride_a;
    dst_b -= dst_stride_b;
  }
}

LIBYUV_API
int RotatePlane(const uint8* src, int src_stride,
                uint8* dst, int dst_stride,
                int width, int height,
                enum RotationMode mode) {
  if (!src || width <= 0 || height == 0 || !dst) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src = src + (height - 1) * src_stride;
    src_stride = -src_stride;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      CopyPlane(src, src_stride,
                dst, dst_stride,
                width, height);
      return 0;
    case kRotate90:
      RotatePlane90(src, src_stride,
                    dst, dst_stride,
                    width, height);
      return 0;
    case kRotate270:
      RotatePlane270(src, src_stride,
                     dst, dst_stride,
                     width, height);
      return 0;
    case kRotate180:
      RotatePlane180(src, src_stride,
                     dst, dst_stride,
                     width, height);
      return 0;
    default:
      break;
  }
  return -1;
}
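
// A minimal usage sketch (illustrative only; the plane sizes below are
// assumptions for the example): rotating a 640x480 plane by 90 degrees
// produces a 480x640 plane, so the destination stride becomes the source
// height.
//   uint8 src_plane[640 * 480];
//   uint8 dst_plane[480 * 640];
//   RotatePlane(src_plane, 640, dst_plane, 480, 640, 480, kRotate90);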

LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               enum RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}
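
// Note: for kRotate90 and kRotate270 the destination planes have swapped
// dimensions, so dst_stride_y should be at least height and dst_stride_u /
// dst_stride_v at least halfheight.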

LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     enum RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_y || !src_uv || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  switch (mode) {
    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_stride_y,
                        src_uv, src_stride_uv,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  return -1;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif