/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#include "libyuv/rotate.h"
127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#include "libyuv/cpu_id.h"
1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/convert.h"
1533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/planar_functions.h"
1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h"
177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordenamespace libyuv {
2033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampextern "C" {
2133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// DECLARE_FUNCTION(name) expands to a string of assembler directives that
// switches to the .text section, aligns to 4 (padding with 0x90, the x86 NOP
// opcode) and emits the entry label for `name`.  It is only defined for x86
// builds with assembly enabled.
//
// Three label spellings are produced:
//   * Apple i386:           "_name", additionally marked .private_extern.
//   * MinGW / Cygwin i386:  "_name".
//   * everything else:      "name" (no leading underscore).
//
// The MinGW/Cygwin test is parenthesised so that the __i386__ requirement
// applies to both toolchains, matching the Apple branch.  Without the
// parentheses `&&` binds tighter than `||`, so any MinGW target (including
// 64-bit ones) would pick the underscore-prefixed label.
#if !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".private_extern _" #name "                \n"                             \
    ".align 4,0x90                             \n"                             \
"_" #name ":                                   \n"
#elif (defined(__MINGW32__) || defined(__CYGWIN__)) && defined(__i386__)
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".align 4,0x90                             \n"                             \
"_" #name ":                                   \n"
#else
#define DECLARE_FUNCTION(name)                                                 \
    ".text                                     \n"                             \
    ".align 4,0x90                             \n"                             \
#name ":                                       \n"
#endif
#endif
437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Prototypes for the ARM NEON mirror and 8-row transpose kernels, compiled
// only when assembly is enabled and the compiler targets NEON.
// Only declarations appear here; the definitions are not visible in this
// part of the file (presumably a NEON-specific source -- TODO confirm).
// Each HAS_* macro is defined alongside its prototype, presumably so that
// later code can test for the routine with #if defined(HAS_...) -- confirm
// against the callers.
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
#define HAS_MIRRORROW_UV_NEON
void MirrorRowUV_NEON(const uint8* src,
                      uint8* dst_a, uint8* dst_b,
                      int width);
#define HAS_TRANSPOSE_WX8_NEON
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWX8_NEON
void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width);
#endif  // defined(__ARM_NEON__)
607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
6133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#define HAS_TRANSPOSE_WX8_SSSE3
6333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeWx8_SSSE3(const uint8* src, int src_stride,
657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                               uint8* dst, int dst_stride, int width) {
6633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push      edi
687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push      esi
697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push      ebp
707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       eax, [esp + 12 + 4]   // src
717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edi, [esp + 12 + 8]   // src_stride
727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 12 + 12]  // dst
737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       esi, [esp + 12 + 16]  // dst_stride
747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [esp + 12 + 20]  // width
7533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // Read in the data from the source pointer.
777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // First round of bit swap.
7833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
7933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      xmm0, qword ptr [eax]
817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebp, [eax + 8]
827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      xmm1, qword ptr [eax + edi]
837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 2 * edi]
847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklbw xmm0, xmm1
857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      xmm2, qword ptr [eax]
867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm1, xmm0
877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm1, xmm1, 8
887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      xmm3, qword ptr [eax + edi]
897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 2 * edi]
907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklbw xmm2, xmm3
917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm3, xmm2
927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      xmm4, qword ptr [eax]
937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm3, xmm3, 8
947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      xmm5, qword ptr [eax + edi]
957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklbw xmm4, xmm5
967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 2 * edi]
977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, xmm4
987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      xmm6, qword ptr [eax]
997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm5, xmm5, 8
1007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      xmm7, qword ptr [eax + edi]
1017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklbw xmm6, xmm7
1027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       eax, ebp
1037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, xmm6
1047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm7, xmm7, 8
1057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // Second round of bit swap.
1067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklwd xmm0, xmm2
1077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklwd xmm1, xmm3
1087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm2, xmm0
1097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm3, xmm1
1107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm2, xmm2, 8
1117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm3, xmm3, 8
1127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklwd xmm4, xmm6
1137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklwd xmm5, xmm7
1147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, xmm4
1157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, xmm5
1167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm6, xmm6, 8
1177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm7, xmm7, 8
1187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // Third round of bit swap.
1197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // Write to the destination pointer.
1207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckldq xmm0, xmm4
1217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      qword ptr [edx], xmm0
1227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm4, xmm0
1237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm4, xmm4, 8
1247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      qword ptr [edx + esi], xmm4
1257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2 * esi]
1267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckldq xmm2, xmm6
1277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, xmm2
1287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm6, xmm6, 8
1297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      qword ptr [edx], xmm2
1307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckldq xmm1, xmm5
1317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      qword ptr [edx + esi], xmm6
1327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2 * esi]
1337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, xmm1
1347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      qword ptr [edx], xmm1
1357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm5, xmm5, 8
1367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckldq xmm3, xmm7
1377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      qword ptr [edx + esi], xmm5
1387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2 * esi]
1397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      qword ptr [edx], xmm3
1407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, xmm3
1417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm7, xmm7, 8
14233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 8
1437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      qword ptr [edx + esi], xmm7
1447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2 * esi]
14533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
1467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop       ebp
1487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop       esi
1497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop       edi
1507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
1517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
1527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
1537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#define HAS_TRANSPOSE_UVWX8_SSE2
15533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
1567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
1577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                uint8* dst_a, int dst_stride_a,
1587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                uint8* dst_b, int dst_stride_b,
1597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                int w) {
16033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
1617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push      ebx
1627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push      esi
1637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push      edi
1647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push      ebp
1657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       eax, [esp + 16 + 4]   // src
1667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edi, [esp + 16 + 8]   // src_stride
1677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 16 + 12]  // dst_a
1687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       esi, [esp + 16 + 16]  // dst_stride_a
1697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ebx, [esp + 16 + 20]  // dst_b
1707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ebp, [esp + 16 + 24]  // dst_stride_b
1717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, esp
1727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub       esp, 4 + 16
1737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    and       esp, ~15
1747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       [esp + 16], ecx
1757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [ecx + 16 + 28]  // w
17633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
17833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
1797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // Read in the data from the source pointer.
1807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // First round of bit swap.
1817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm0, [eax]
1827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm1, [eax + edi]
1837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 2 * edi]
1847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, xmm0  // use xmm7 as temp register.
1857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklbw xmm0, xmm1
1867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhbw xmm7, xmm1
1877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm1, xmm7
1887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm2, [eax]
1897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm3, [eax + edi]
1907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 2 * edi]
1917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, xmm2
1927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklbw xmm2, xmm3
1937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhbw xmm7, xmm3
1947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm3, xmm7
1957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm4, [eax]
1967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, [eax + edi]
1977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 2 * edi]
1987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, xmm4
1997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklbw xmm4, xmm5
2007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhbw xmm7, xmm5
2017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, xmm7
2027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, [eax]
2037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, [eax + edi]
2047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 2 * edi]
2057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [esp], xmm5  // backup xmm5
2067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    neg       edi
2077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, xmm6   // use xmm5 as temp register.
2087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklbw xmm6, xmm7
2097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhbw xmm5, xmm7
2107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, xmm5
2117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 8 * edi + 16]
2127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    neg       edi
2137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // Second round of bit swap.
2147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, xmm0
2157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklwd xmm0, xmm2
2167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhwd xmm5, xmm2
2177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm2, xmm5
2187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, xmm1
2197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklwd xmm1, xmm3
2207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhwd xmm5, xmm3
2217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm3, xmm5
2227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, xmm4
2237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklwd xmm4, xmm6
2247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhwd xmm5, xmm6
2257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, xmm5
2267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm5, [esp]  // restore xmm5
2277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [esp], xmm6  // backup xmm6
2287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, xmm5    // use xmm6 as temp register.
2297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpcklwd xmm5, xmm7
2307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhwd xmm6, xmm7
2317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm7, xmm6
2327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // Third round of bit swap.
2337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // Write to the destination pointer.
2347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, xmm0
2357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckldq xmm0, xmm4
2367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhdq xmm6, xmm4
2377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm4, xmm6
2387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, [esp]  // restore xmm6
2397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlpd    qword ptr [edx], xmm0
2407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhpd    qword ptr [ebx], xmm0
2417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlpd    qword ptr [edx + esi], xmm4
2427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2 * esi]
2437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhpd    qword ptr [ebx + ebp], xmm4
2447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebx, [ebx + 2 * ebp]
2457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
2467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckldq xmm2, xmm6
2477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlpd    qword ptr [edx], xmm2
2487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhpd    qword ptr [ebx], xmm2
2497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhdq xmm0, xmm6
2507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlpd    qword ptr [edx + esi], xmm0
2517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2 * esi]
2527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhpd    qword ptr [ebx + ebp], xmm0
2537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebx, [ebx + 2 * ebp]
2547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
2557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckldq xmm1, xmm5
2567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlpd    qword ptr [edx], xmm1
2577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhpd    qword ptr [ebx], xmm1
2587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhdq xmm0, xmm5
2597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlpd    qword ptr [edx + esi], xmm0
2607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2 * esi]
2617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhpd    qword ptr [ebx + ebp], xmm0
2627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebx, [ebx + 2 * ebp]
2637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
2647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckldq xmm3, xmm7
2657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlpd    qword ptr [edx], xmm3
2667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhpd    qword ptr [ebx], xmm3
2677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    punpckhdq xmm0, xmm7
26833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 8
2697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlpd    qword ptr [edx + esi], xmm0
2707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2 * esi]
2717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhpd    qword ptr [ebx + ebp], xmm0
2727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebx, [ebx + 2 * ebp]
27333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
2747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       esp, [esp + 16]
2767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop       ebp
2777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop       edi
2787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop       esi
2797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop       ebx
2807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
2817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
2827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
28333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
2847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#define HAS_TRANSPOSE_WX8_SSSE3
2857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeWx8_SSSE3(const uint8* src, int src_stride,
2867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                               uint8* dst, int dst_stride, int width) {
28733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
28833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Read in the data from the source pointer.
28933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // First round of bit swap.
29033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                                 \n"
29133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                            \n"
29233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%0),%%xmm0                      \n"
29333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%0,%3),%%xmm1                   \n"
29433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        (%0,%3,2),%0                     \n"
29533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm1,%%xmm0                    \n"
29633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%0),%%xmm2                      \n"
29733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,%%xmm1                    \n"
29833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm1,%%xmm1               \n"
29933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%0,%3),%%xmm3                   \n"
30033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        (%0,%3,2),%0                     \n"
30133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm3,%%xmm2                    \n"
30233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm2,%%xmm3                    \n"
30333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%0),%%xmm4                      \n"
30433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm3,%%xmm3               \n"
30533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%0,%3),%%xmm5                   \n"
30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        (%0,%3,2),%0                     \n"
30733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm5,%%xmm4                    \n"
30833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm4,%%xmm5                    \n"
30933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%0),%%xmm6                      \n"
31033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm5,%%xmm5               \n"
31133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%0,%3),%%xmm7                   \n"
31233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        (%0,%3,2),%0                     \n"
31333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm7,%%xmm6                    \n"
31433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "neg        %3                               \n"
31533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm6,%%xmm7                    \n"
31633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x8(%0,%3,8),%0                  \n"
31733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm7,%%xmm7               \n"
31833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "neg        %3                               \n"
31933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp     // Second round of bit swap.
32033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd  %%xmm2,%%xmm0                    \n"
32133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd  %%xmm3,%%xmm1                    \n"
32233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,%%xmm2                    \n"
32333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm1,%%xmm3                    \n"
32433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm2,%%xmm2               \n"
32533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm3,%%xmm3               \n"
32633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd  %%xmm6,%%xmm4                    \n"
32733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd  %%xmm7,%%xmm5                    \n"
32833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm4,%%xmm6                    \n"
32933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm5,%%xmm7                    \n"
33033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm6,%%xmm6               \n"
33133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm7,%%xmm7               \n"
33233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Third round of bit swap.
33333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Write to the destination pointer.
33433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq  %%xmm4,%%xmm0                    \n"
33533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       %%xmm0,(%1)                      \n"
33633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,%%xmm4                    \n"
33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm4,%%xmm4               \n"
33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       %%xmm4,(%1,%4)                   \n"
33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        (%1,%4,2),%1                     \n"
34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq  %%xmm6,%%xmm2                    \n"
34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm2,%%xmm6                    \n"
34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       %%xmm2,(%1)                      \n"
34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm6,%%xmm6               \n"
34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq  %%xmm5,%%xmm1                    \n"
34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       %%xmm6,(%1,%4)                   \n"
34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        (%1,%4,2),%1                     \n"
34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm1,%%xmm5                    \n"
34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       %%xmm1,(%1)                      \n"
34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm5,%%xmm5               \n"
35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       %%xmm5,(%1,%4)                   \n"
35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        (%1,%4,2),%1                     \n"
35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq  %%xmm7,%%xmm3                    \n"
35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       %%xmm3,(%1)                      \n"
35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm3,%%xmm7                    \n"
35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr    $0x8,%%xmm7,%%xmm7               \n"
35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub        $0x8,%2                          \n"
35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       %%xmm7,(%1,%4)                   \n"
35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        (%1,%4,2),%1                     \n"
35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg         1b                               \n"
36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+r"(src),    // %0
36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(dst),    // %1
36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+r"(width)   // %2
36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "r"(static_cast<intptr_t>(src_stride)),  // %3
36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "r"(static_cast<intptr_t>(dst_stride))   // %4
36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "memory", "cc"
36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  #if defined(__SSE2__)
36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  #endif
36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
3707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
3717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
3737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#define HAS_TRANSPOSE_UVWX8_SSE2
// 32-bit x86 transpose of a byte-pair-interleaved plane into two planes.
//
// The function body is emitted by the file-scope asm statement below; the
// prototype only makes the symbol callable from C++.
//
// Each loop iteration reads eight 16-byte rows from src (rows are src_stride
// bytes apart), transposes them with three rounds of unpack instructions and
// writes eight 8-byte rows to dst_a (low half of each result register) and
// eight 8-byte rows to dst_b (high half).  src then advances by 16 bytes,
// both destinations advance by 8 rows, and w is reduced by 8.  The loop body
// always executes at least once and repeats while w is still > 0.
//
// src rows are loaded with movdqa, which requires 16-byte-aligned addresses.
// NOTE(review): arguments are fetched from fixed stack offsets, which assumes
// the 32-bit cdecl convention (all arguments on the stack) - confirm that no
// other calling convention is ever applied to this symbol.
// NOTE(review): DECLARE_FUNCTION is defined outside this block; presumably it
// emits the global symbol/label for the routine - verify.
3747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
3757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                    uint8* dst_a, int dst_stride_a,
3767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                    uint8* dst_b, int dst_stride_b,
3777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                    int w);
37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm (
37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
    // Prologue: preserve the four registers that are overwritten below.
    // After these four pushes the first stack argument sits at 0x14(%esp).
38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "push   %ebx                               \n"
38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "push   %esi                               \n"
38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "push   %edi                               \n"
38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "push   %ebp                               \n"
    // Load arguments in declaration order:
    //   eax = src, edi = src_stride, edx = dst_a, esi = dst_stride_a,
    //   ebx = dst_b, ebp = dst_stride_b.
38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    0x14(%esp),%eax                    \n"
38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    0x18(%esp),%edi                    \n"
38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    0x1c(%esp),%edx                    \n"
38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    0x20(%esp),%esi                    \n"
38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    0x24(%esp),%ebx                    \n"
38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    0x28(%esp),%ebp                    \n"
    // Carve out a 16-byte-aligned spill slot at (%esp) for movdqa and keep
    // the original stack pointer at 0x10(%esp) so the epilogue can restore
    // it.  The last argument (w) is then read through the saved pointer
    // into ecx.
39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    %esp,%ecx                          \n"
39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub    $0x14,%esp                         \n"
39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "and    $0xfffffff0,%esp                   \n"
39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    %ecx,0x10(%esp)                    \n"
39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    0x2c(%ecx),%ecx                    \n"
3957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp"1:                                            \n"
    // Round 1: load rows two at a time and interleave their bytes.  The low
    // interleave stays in the first register of the pair, the high
    // interleave ends up in the second (xmm7 is the temporary copy).
39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%eax),%xmm0                       \n"
39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%eax,%edi,1),%xmm1                \n"
39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%eax,%edi,2),%eax                 \n"
40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm0,%xmm7                        \n"
40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %xmm1,%xmm0                     \n"
40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %xmm1,%xmm7                     \n"
40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm7,%xmm1                        \n"
40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%eax),%xmm2                       \n"
40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%eax,%edi,1),%xmm3                \n"
40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%eax,%edi,2),%eax                 \n"
40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm2,%xmm7                        \n"
40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %xmm3,%xmm2                     \n"
40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %xmm3,%xmm7                     \n"
41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm7,%xmm3                        \n"
41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%eax),%xmm4                       \n"
41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%eax,%edi,1),%xmm5                \n"
41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%eax,%edi,2),%eax                 \n"
41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm4,%xmm7                        \n"
41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %xmm5,%xmm4                     \n"
41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %xmm5,%xmm7                     \n"
41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm7,%xmm5                        \n"
41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%eax),%xmm6                       \n"
41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%eax,%edi,1),%xmm7                \n"
42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%eax,%edi,2),%eax                 \n"
    // All eight xmm registers are now live, so xmm5 is spilled to the
    // aligned stack slot and reused as the temporary for the last row pair.
42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm5,(%esp)                       \n"
    // eax is 8 rows past the tile start.  Negating the stride lets a single
    // lea compute eax - 8 * src_stride + 16 (back to the first row, 16 bytes
    // to the right); the second neg restores the stride.
42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "neg    %edi                               \n"
42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm6,%xmm5                        \n"
42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %xmm7,%xmm6                     \n"
42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %xmm7,%xmm5                     \n"
42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm5,%xmm7                        \n"
42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    0x10(%eax,%edi,8),%eax             \n"
42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "neg    %edi                               \n"
    // Round 2: interleave 16-bit words (xmm5 is the temporary copy).
42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm0,%xmm5                        \n"
43033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %xmm2,%xmm0                     \n"
43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %xmm2,%xmm5                     \n"
43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm5,%xmm2                        \n"
43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm1,%xmm5                        \n"
43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %xmm3,%xmm1                     \n"
43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %xmm3,%xmm5                     \n"
43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm5,%xmm3                        \n"
43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm4,%xmm5                        \n"
43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %xmm6,%xmm4                     \n"
43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %xmm6,%xmm5                     \n"
44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm5,%xmm6                        \n"
    // Swap the spill: bring the saved xmm5 back and park xmm6 in the slot so
    // xmm6 can serve as the temporary for the final word interleave.
44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%esp),%xmm5                       \n"
44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm6,(%esp)                       \n"
44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm5,%xmm6                        \n"
44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %xmm7,%xmm5                     \n"
44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %xmm7,%xmm6                     \n"
44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm6,%xmm7                        \n"
    // Round 3: interleave 32-bit dwords and store.  For every result
    // register the low 8 bytes go to dst_a (edx) and the high 8 bytes go to
    // dst_b (ebx); both pointers advance two rows after each pair of stores.
44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm0,%xmm6                        \n"
44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq %xmm4,%xmm0                     \n"
44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhdq %xmm4,%xmm6                     \n"
45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm6,%xmm4                        \n"
    // Reload the value parked in the spill slot during round 2.
45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa (%esp),%xmm6                       \n"
45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd %xmm0,(%edx)                       \n"
45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd %xmm0,(%ebx)                       \n"
45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd %xmm4,(%edx,%esi,1)                \n"
45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%edx,%esi,2),%edx                 \n"
45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd %xmm4,(%ebx,%ebp,1)                \n"
45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%ebx,%ebp,2),%ebx                 \n"
45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm2,%xmm0                        \n"
45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq %xmm6,%xmm2                     \n"
46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd %xmm2,(%edx)                       \n"
46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd %xmm2,(%ebx)                       \n"
46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhdq %xmm6,%xmm0                     \n"
46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd %xmm0,(%edx,%esi,1)                \n"
46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%edx,%esi,2),%edx                 \n"
46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%ebx,%ebp,2),%ebx                 \n"
46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm1,%xmm0                        \n"
46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq %xmm5,%xmm1                     \n"
46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd %xmm1,(%edx)                       \n"
47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd %xmm1,(%ebx)                       \n"
47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhdq %xmm5,%xmm0                     \n"
47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd %xmm0,(%edx,%esi,1)                \n"
47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%edx,%esi,2),%edx                 \n"
47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%ebx,%ebp,2),%ebx                 \n"
47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa %xmm3,%xmm0                        \n"
47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq %xmm7,%xmm3                     \n"
47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd %xmm3,(%edx)                       \n"
47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd %xmm3,(%ebx)                       \n"
48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhdq %xmm7,%xmm0                     \n"
    // Count down the remaining width.  The stores and lea instructions that
    // follow do not modify EFLAGS, so the jg at the bottom of the loop still
    // tests the result of this sub.
48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub    $0x8,%ecx                          \n"
48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd %xmm0,(%edx,%esi,1)                \n"
48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%edx,%esi,2),%edx                 \n"
48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd %xmm0,(%ebx,%ebp,1)                \n"
48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea    (%ebx,%ebp,2),%ebx                 \n"
48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg     1b                                 \n"
    // Epilogue: restore the original stack pointer saved at 0x10(%esp), then
    // pop the preserved registers in reverse push order and return.
48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov    0x10(%esp),%esp                    \n"
48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pop    %ebp                               \n"
48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pop    %edi                               \n"
49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pop    %esi                               \n"
49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pop    %ebx                               \n"
49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "ret                                       \n"
4937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde);
49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
4957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
4967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#define HAS_TRANSPOSE_WX8_FAST_SSSE3
// Transposes 8 source rows, 16 columns per loop iteration.
//
// Each iteration reads eight 16-byte rows from src (rows are src_stride bytes
// apart) and writes sixteen 8-byte rows to dst (rows are dst_stride bytes
// apart).  src then advances by 16 bytes, dst advances by 16 rows, and width
// is reduced by 16.  The loop body always executes at least once and repeats
// while width is still > 0.
//
// Source rows are loaded with movdqa, which requires 16-byte-aligned
// addresses; destination rows are written with 8-byte movq stores.
// palignr is an SSSE3 instruction and xmm8-xmm15 exist only in 64-bit mode.
//
// NOTE(review): operand %3 (src_stride) is declared input-only but is negated
// in place and then negated back inside the asm.  Its value is restored
// before the asm ends, but GCC documents that input-only operands must not be
// modified - confirm this is acceptable for the supported toolchains.
4977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
4987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                    uint8* dst, int dst_stride, int width) {
49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
5007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Read in the data from the source pointer, two rows at a time.
5017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // First round: interleave the bytes of each pair of rows.
  // For every row pair the low-half interleave stays in xmm0/2/4/6 and the
  // high-half interleave goes to xmm8/10/12/14.  "palignr $8, x, x" rotates
  // a register by 8 bytes, so the odd-numbered copies (xmm1/3/5/7 and
  // xmm9/11/13/15) carry the upper 8 bytes of those results in their low
  // half, ready for the low-half-only unpacks of the next round.
50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  ".p2align  4                                 \n"
50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp"1:                                            \n"
50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0),%%xmm0                      \n"
50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0,%3),%%xmm1                   \n"
50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%0,%3,2),%0                     \n"
50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm0,%%xmm8                    \n"
50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklbw  %%xmm1,%%xmm0                    \n"
50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhbw  %%xmm1,%%xmm8                    \n"
51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0),%%xmm2                      \n"
51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm0,%%xmm1                    \n"
51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm9                    \n"
51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm1,%%xmm1               \n"
51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm9,%%xmm9               \n"
51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0,%3),%%xmm3                   \n"
51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%0,%3,2),%0                     \n"
51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm2,%%xmm10                   \n"
51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklbw  %%xmm3,%%xmm2                    \n"
51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhbw  %%xmm3,%%xmm10                   \n"
52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm2,%%xmm3                    \n"
52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm10,%%xmm11                  \n"
52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0),%%xmm4                      \n"
52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm3,%%xmm3               \n"
52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm11,%%xmm11             \n"
52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0,%3),%%xmm5                   \n"
52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%0,%3,2),%0                     \n"
52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm4,%%xmm12                   \n"
52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklbw  %%xmm5,%%xmm4                    \n"
52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhbw  %%xmm5,%%xmm12                   \n"
53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm4,%%xmm5                    \n"
53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm12,%%xmm13                  \n"
53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0),%%xmm6                      \n"
53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm5,%%xmm5               \n"
53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm13,%%xmm13             \n"
53533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0,%3),%%xmm7                   \n"
53633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%0,%3,2),%0                     \n"
53733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm6,%%xmm14                   \n"
53833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklbw  %%xmm7,%%xmm6                    \n"
53933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhbw  %%xmm7,%%xmm14                   \n"
  // %0 is now 8 rows past the tile start.  Negating the stride lets a single
  // lea compute %0 - 8 * src_stride + 16 (back to the first row, 16 bytes to
  // the right); the second neg below restores the stride.
54033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "neg        %3                               \n"
54133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm6,%%xmm7                    \n"
54233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm14,%%xmm15                  \n"
54333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        0x10(%0,%3,8),%0                 \n"
54433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm7,%%xmm7               \n"
54533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm15,%%xmm15             \n"
54633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "neg        %3                               \n"
5477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde   // Second round: interleave 16-bit words, then rotate copies by 8 bytes.
54833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm2,%%xmm0                    \n"
54933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm3,%%xmm1                    \n"
55033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm0,%%xmm2                    \n"
55133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm1,%%xmm3                    \n"
55233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm2,%%xmm2               \n"
55333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm3,%%xmm3               \n"
55433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm6,%%xmm4                    \n"
55533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm7,%%xmm5                    \n"
55633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm4,%%xmm6                    \n"
55733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm5,%%xmm7                    \n"
55833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm6,%%xmm6               \n"
55933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm7,%%xmm7               \n"
56033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm10,%%xmm8                   \n"
56133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm11,%%xmm9                   \n"
56233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm10                   \n"
56333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm9,%%xmm11                   \n"
56433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm10,%%xmm10             \n"
56533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm11,%%xmm11             \n"
56633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm14,%%xmm12                  \n"
56733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm15,%%xmm13                  \n"
56833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm12,%%xmm14                  \n"
56933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm13,%%xmm15                  \n"
57033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm14,%%xmm14             \n"
57133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm15,%%xmm15             \n"
5727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Third round: interleave 32-bit dwords.
5737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Write to the destination pointer.
  // Each unpack result yields two output rows: movq stores its low 8 bytes
  // at (%1), the copy rotated by palignr supplies the upper 8 bytes for the
  // store at (%1,%4), and %1 then advances by two destination rows.
57433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm4,%%xmm0                    \n"
57533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm0,(%1)                      \n"
57633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm0,%%xmm4                    \n"
57733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm4,%%xmm4               \n"
57833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm4,(%1,%4)                   \n"
57933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%4,2),%1                     \n"
58033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm6,%%xmm2                    \n"
58133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm2,%%xmm6                    \n"
58233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm2,(%1)                      \n"
58333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm6,%%xmm6               \n"
58433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm5,%%xmm1                    \n"
58533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm6,(%1,%4)                   \n"
58633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%4,2),%1                     \n"
58733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm1,%%xmm5                    \n"
58833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm1,(%1)                      \n"
58933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm5,%%xmm5               \n"
59033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm5,(%1,%4)                   \n"
59133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%4,2),%1                     \n"
59233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm7,%%xmm3                    \n"
59333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm3,(%1)                      \n"
59433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm3,%%xmm7                    \n"
59533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm7,%%xmm7               \n"
59633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm7,(%1,%4)                   \n"
59733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%4,2),%1                     \n"
  // Same store sequence for xmm8-xmm15, which hold the data derived from
  // the upper 8 bytes of each source row.
59833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm12,%%xmm8                   \n"
59933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm8,(%1)                      \n"
60033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm12                   \n"
60133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm12,%%xmm12             \n"
60233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm12,(%1,%4)                  \n"
60333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%4,2),%1                     \n"
60433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm14,%%xmm10                  \n"
60533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm10,%%xmm14                  \n"
60633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm10,(%1)                     \n"
60733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm14,%%xmm14             \n"
60833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm13,%%xmm9                   \n"
60933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm14,(%1,%4)                  \n"
61033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%4,2),%1                     \n"
61133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm9,%%xmm13                   \n"
61233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm9,(%1)                      \n"
61333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm13,%%xmm13             \n"
61433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm13,(%1,%4)                  \n"
61533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%4,2),%1                     \n"
61633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm15,%%xmm11                  \n"
61733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm11,(%1)                     \n"
61833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm11,%%xmm15                  \n"
61933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "palignr    $0x8,%%xmm15,%%xmm15             \n"
  // Count down the remaining width.  The movq store and lea that follow do
  // not modify EFLAGS, so the jg below still tests the result of this sub.
62033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "sub        $0x10,%2                         \n"
62133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movq       %%xmm15,(%1,%4)                  \n"
62233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%4,2),%1                     \n"
62333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "jg         1b                               \n"
6247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  : "+r"(src),    // %0
6257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(dst),    // %1
6267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(width)   // %2
6277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  : "r"(static_cast<intptr_t>(src_stride)),  // %3
6287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "r"(static_cast<intptr_t>(dst_stride))   // %4
62933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc",
63033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
63133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13",  "xmm14",  "xmm15"
6327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde);
6337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
6347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
6357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#define HAS_TRANSPOSE_UVWX8_SSE2
// Transposes an 8-row strip of interleaved two-channel data and splits it
// into two output planes (the inline notes below call the channels U and V).
//
// Each pass of the loop loads 16 bytes from each of 8 consecutive source rows
// with movdqa (an aligned 16-byte load), interleaves the registers in three
// rounds (bytes, 16-bit words, 32-bit dwords), and then stores 8 bytes to each
// of 8 rows of dst_a (low half of a register) and dst_b (high half).
//
//   src, src_stride      - source pointer and distance between source rows.
//   dst_a, dst_stride_a  - first output plane and distance between its rows.
//   dst_b, dst_stride_b  - second output plane and distance between its rows.
//   w                    - loop counter; reduced by 8 per pass, and the loop
//                          repeats while the result is greater than zero.
//
// xmm8 and xmm9 are used as scratch registers, so this variant needs 64-bit
// mode.
// NOTE(review): operand %4 (src_stride) is declared input-only but is negated
// and then restored inside the loop; its value on exit is unchanged.
6367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
6377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                uint8* dst_a, int dst_stride_a,
6387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                uint8* dst_b, int dst_stride_b,
6397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                int w) {
64033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
6417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Load 8 source rows, two at a time, stepping src down two rows per pair.
6427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // First round: interleave the bytes of each row pair (low and high halves).
64333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  ".p2align  4                                 \n"
64433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp"1:                                            \n"
64533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0),%%xmm0                      \n"
64633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0,%4),%%xmm1                   \n"
64733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%0,%4,2),%0                     \n"
64833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm0,%%xmm8                    \n"
64933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklbw  %%xmm1,%%xmm0                    \n"
65033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhbw  %%xmm1,%%xmm8                    \n"
65133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm1                    \n"
65233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0),%%xmm2                      \n"
65333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0,%4),%%xmm3                   \n"
65433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%0,%4,2),%0                     \n"
65533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm2,%%xmm8                    \n"
65633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklbw  %%xmm3,%%xmm2                    \n"
65733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhbw  %%xmm3,%%xmm8                    \n"
65833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm3                    \n"
65933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0),%%xmm4                      \n"
66033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0,%4),%%xmm5                   \n"
66133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%0,%4,2),%0                     \n"
66233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm4,%%xmm8                    \n"
66333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklbw  %%xmm5,%%xmm4                    \n"
66433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhbw  %%xmm5,%%xmm8                    \n"
66533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm5                    \n"
66633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0),%%xmm6                      \n"
66733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     (%0,%4),%%xmm7                   \n"
66833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%0,%4,2),%0                     \n"
66933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm6,%%xmm8                    \n"
67033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklbw  %%xmm7,%%xmm6                    \n"
  // Negate the stride so the lea below rewinds src by the 8 rows just read
  // and advances it 16 bytes to the next column group; the second neg
  // restores the stride.
67133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "neg        %4                               \n"
67233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        0x10(%0,%4,8),%0                 \n"
67333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhbw  %%xmm7,%%xmm8                    \n"
67433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm7                    \n"
67533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "neg        %4                               \n"
6767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde   // Second round: interleave 16-bit words.
67733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm0,%%xmm8                    \n"
67833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm1,%%xmm9                    \n"
67933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhwd  %%xmm2,%%xmm8                    \n"
68033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhwd  %%xmm3,%%xmm9                    \n"
68133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm2,%%xmm0                    \n"
68233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm3,%%xmm1                    \n"
68333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm2                    \n"
68433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm9,%%xmm3                    \n"
68533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm4,%%xmm8                    \n"
68633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm5,%%xmm9                    \n"
68733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhwd  %%xmm6,%%xmm8                    \n"
68833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhwd  %%xmm7,%%xmm9                    \n"
68933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm6,%%xmm4                    \n"
69033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpcklwd  %%xmm7,%%xmm5                    \n"
69133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm8,%%xmm6                    \n"
69233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm9,%%xmm7                    \n"
6937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Third round: interleave 32-bit dwords.
6947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Store each result: low 8 bytes to dst_a (%1), high 8 bytes to dst_b (%2).
69533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm0,%%xmm8                    \n"
69633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm4,%%xmm0                    \n"
69733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
69833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
69933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhdq  %%xmm4,%%xmm8                    \n"
70033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movlpd     %%xmm8,(%1,%5)                   \n"
70133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%5,2),%1                     \n"
70233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movhpd     %%xmm8,(%2,%6)                   \n"
70333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%2,%6,2),%2                     \n"
70433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm2,%%xmm8                    \n"
70533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm6,%%xmm2                    \n"
70633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movlpd     %%xmm2,(%1)                      \n"
70733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movhpd     %%xmm2,(%2)                      \n"
70833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhdq  %%xmm6,%%xmm8                    \n"
70933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movlpd     %%xmm8,(%1,%5)                   \n"
71033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%5,2),%1                     \n"
71133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movhpd     %%xmm8,(%2,%6)                   \n"
71233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%2,%6,2),%2                     \n"
71333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm1,%%xmm8                    \n"
71433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm5,%%xmm1                    \n"
71533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movlpd     %%xmm1,(%1)                      \n"
71633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movhpd     %%xmm1,(%2)                      \n"
71733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhdq  %%xmm5,%%xmm8                    \n"
71833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movlpd     %%xmm8,(%1,%5)                   \n"
71933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%5,2),%1                     \n"
72033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movhpd     %%xmm8,(%2,%6)                   \n"
72133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%2,%6,2),%2                     \n"
72233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movdqa     %%xmm3,%%xmm8                    \n"
72333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckldq  %%xmm7,%%xmm3                    \n"
72433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movlpd     %%xmm3,(%1)                      \n"
72533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movhpd     %%xmm3,(%2)                      \n"
72633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "punpckhdq  %%xmm7,%%xmm8                    \n"
  // The sub sets the flags consumed by the jg at the bottom of the loop; the
  // stores and lea instructions in between do not modify flags.
72733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "sub        $0x8,%3                          \n"
72833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movlpd     %%xmm8,(%1,%5)                   \n"
72933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%1,%5,2),%1                     \n"
73033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "movhpd     %%xmm8,(%2,%6)                   \n"
73133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "lea        (%2,%6,2),%2                     \n"
73233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "jg         1b                               \n"
7337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  : "+r"(src),    // %0
7347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(dst_a),  // %1
7357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(dst_b),  // %2
7367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(w)   // %3
7377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  : "r"(static_cast<intptr_t>(src_stride)),    // %4
7387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "r"(static_cast<intptr_t>(dst_stride_a)),  // %5
7397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "r"(static_cast<intptr_t>(dst_stride_b))   // %6
74033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc",
74133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
74233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "xmm8", "xmm9"
7437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde);
7447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
7457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
7467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
7477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
7487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeWx8_C(const uint8* src, int src_stride,
7497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                           uint8* dst, int dst_stride,
75033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                           int width) {
75133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < width; ++i) {
7527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst[0] = src[0 * src_stride];
7537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst[1] = src[1 * src_stride];
7547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst[2] = src[2 * src_stride];
7557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst[3] = src[3 * src_stride];
7567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst[4] = src[4 * src_stride];
7577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst[5] = src[5 * src_stride];
7587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst[6] = src[6 * src_stride];
7597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst[7] = src[7 * src_stride];
7607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ++src;
7617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst += dst_stride;
7627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
7637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
7647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
7657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeWxH_C(const uint8* src, int src_stride,
7667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                           uint8* dst, int dst_stride,
7677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                           int width, int height) {
76833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < width; ++i) {
76933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    for (int j = 0; j < height; ++j) {
7707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      dst[i * dst_stride + j] = src[j * src_stride + i];
77133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    }
77233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
7737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
7747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
77533cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
7767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid TransposePlane(const uint8* src, int src_stride,
7777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    uint8* dst, int dst_stride,
7787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    int width, int height) {
77933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  void (*TransposeWx8)(const uint8* src, int src_stride,
78033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint8* dst, int dst_stride,
78133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       int width) = TransposeWx8_C;
7827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#if defined(HAS_TRANSPOSE_WX8_NEON)
78333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasNEON)) {
7847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    TransposeWx8 = TransposeWx8_NEON;
78533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
7867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
7877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#if defined(HAS_TRANSPOSE_WX8_SSSE3)
78833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
7897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    TransposeWx8 = TransposeWx8_SSSE3;
79033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
7917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
79233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
79333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSSE3) &&
79433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(width, 16) &&
79533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
79633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    TransposeWx8 = TransposeWx8_FAST_SSSE3;
7977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
79833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
7997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
80033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // Work across the source in 8x8 tiles
80133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int i = height;
8027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  while (i >= 8) {
8037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    TransposeWx8(src, src_stride, dst, dst_stride, width);
80433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src += 8 * src_stride;    // Go down 8 rows.
80533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    dst += 8;                 // Move over 8 columns.
80633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    i -= 8;
8077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
8087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
80933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
8107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
8117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
81233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
8137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid RotatePlane90(const uint8* src, int src_stride,
8147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                   uint8* dst, int dst_stride,
8157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                   int width, int height) {
8167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Rotate by 90 is a transpose with the source read
81733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // from bottom to top. So set the source pointer to the end
8187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // of the buffer and flip the sign of the source stride.
8197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  src += src_stride * (height - 1);
8207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  src_stride = -src_stride;
8217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  TransposePlane(src, src_stride, dst, dst_stride, width, height);
8227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
8237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
82433cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
8257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid RotatePlane270(const uint8* src, int src_stride,
8267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    uint8* dst, int dst_stride,
8277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    int width, int height) {
8287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Rotate by 270 is a transpose with the destination written
82933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // from bottom to top. So set the destination pointer to the end
8307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // of the buffer and flip the sign of the destination stride.
8317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  dst += dst_stride * (width - 1);
8327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  dst_stride = -dst_stride;
8337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  TransposePlane(src, src_stride, dst, dst_stride, width, height);
8347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
8357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
83633cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
8377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid RotatePlane180(const uint8* src, int src_stride,
8387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    uint8* dst, int dst_stride,
8397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    int width, int height) {
84033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
84133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_MIRRORROW_NEON)
84233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasNEON)) {
84333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    MirrorRow = MirrorRow_NEON;
84433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
8457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
84633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_MIRRORROW_SSE2)
84733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSE2) &&
84833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(width, 16) &&
84933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
85033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
85133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    MirrorRow = MirrorRow_SSE2;
85233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
8537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
85433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_MIRRORROW_SSSE3)
85533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSSE3) &&
85633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(width, 16) &&
85733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
85833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
85933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    MirrorRow = MirrorRow_SSSE3;
8607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
86133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
86233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
86333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_COPYROW_NEON)
86433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) {
86533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    CopyRow = CopyRow_NEON;
86633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
86733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
86833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_COPYROW_X86)
86933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
87033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    CopyRow = CopyRow_X86;
87133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
87233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
87333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_COPYROW_SSE2)
87433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
87533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
87633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
87733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    CopyRow = CopyRow_SSE2;
87833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
87933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
88033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (width > kMaxStride) {
88133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    return;
88233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
88333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // Swap first and last row and mirror the content. Uses a temporary row.
88433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  SIMD_ALIGNED(uint8 row[kMaxStride]);
88533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  const uint8* src_bot = src + src_stride * (height - 1);
88633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uint8* dst_bot = dst + dst_stride * (height - 1);
88733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int half_height = (height + 1) >> 1;
88833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // Odd height will harmlessly mirror the middle row twice.
88933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int y = 0; y < half_height; ++y) {
89033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    MirrorRow(src, row, width);  // Mirror first row into a buffer
89133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src += src_stride;
89233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    MirrorRow(src_bot, dst, width);  // Mirror last row into first row
8937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst += dst_stride;
89433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    CopyRow(row, dst_bot, width);  // Copy first mirrored row into last
89533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src_bot -= src_stride;
89633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    dst_bot -= dst_stride;
8977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
8987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
8997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
9007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeUVWx8_C(const uint8* src, int src_stride,
9017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             uint8* dst_a, int dst_stride_a,
9027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             uint8* dst_b, int dst_stride_b,
90333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                             int width) {
90433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < width; ++i) {
9057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a[0] = src[0 * src_stride + 0];
9067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b[0] = src[0 * src_stride + 1];
9077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a[1] = src[1 * src_stride + 0];
9087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b[1] = src[1 * src_stride + 1];
9097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a[2] = src[2 * src_stride + 0];
9107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b[2] = src[2 * src_stride + 1];
9117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a[3] = src[3 * src_stride + 0];
9127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b[3] = src[3 * src_stride + 1];
9137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a[4] = src[4 * src_stride + 0];
9147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b[4] = src[4 * src_stride + 1];
9157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a[5] = src[5 * src_stride + 0];
9167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b[5] = src[5 * src_stride + 1];
9177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a[6] = src[6 * src_stride + 0];
9187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b[6] = src[6 * src_stride + 1];
9197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a[7] = src[7 * src_stride + 0];
9207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b[7] = src[7 * src_stride + 1];
9217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src += 2;
9227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_a += dst_stride_a;
9237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    dst_b += dst_stride_b;
9247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
9257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
9267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
9277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordestatic void TransposeUVWxH_C(const uint8* src, int src_stride,
9287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             uint8* dst_a, int dst_stride_a,
9297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             uint8* dst_b, int dst_stride_b,
93033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                             int width, int height) {
93133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < width * 2; i += 2)
93233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    for (int j = 0; j < height; ++j) {
9337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
9347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
9357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    }
9367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
9377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
93833cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
9397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid TransposeUV(const uint8* src, int src_stride,
9407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 uint8* dst_a, int dst_stride_a,
9417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 uint8* dst_b, int dst_stride_b,
9427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 int width, int height) {
94333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  void (*TransposeUVWx8)(const uint8* src, int src_stride,
94433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* dst_a, int dst_stride_a,
94533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* dst_b, int dst_stride_b,
94633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) = TransposeUVWx8_C;
9477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#if defined(HAS_TRANSPOSE_UVWX8_NEON)
94833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasNEON)) {
94933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    TransposeUVWx8 = TransposeUVWx8_NEON;
95033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
95133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
95233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSE2) &&
95333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(width, 8) &&
95433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
95533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    TransposeUVWx8 = TransposeUVWx8_SSE2;
9567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
95733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
9587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
95933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // Work through the source in 8x8 tiles.
96033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  int i = height;
9617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  while (i >= 8) {
96233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    TransposeUVWx8(src, src_stride,
96333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                   dst_a, dst_stride_a,
96433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                   dst_b, dst_stride_b,
96533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                   width);
96633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src += 8 * src_stride;    // Go down 8 rows.
96733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    dst_a += 8;               // Move over 8 columns.
96833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    dst_b += 8;               // Move over 8 columns.
96933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    i -= 8;
9707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
9717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
97233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  TransposeUVWxH_C(src, src_stride,
97333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                   dst_a, dst_stride_a,
97433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                   dst_b, dst_stride_b,
97533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                   width, i);
9767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
9777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
97833cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
9797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid RotateUV90(const uint8* src, int src_stride,
9807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                uint8* dst_a, int dst_stride_a,
9817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                uint8* dst_b, int dst_stride_b,
9827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                int width, int height) {
9837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  src += src_stride * (height - 1);
9847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  src_stride = -src_stride;
9857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
9867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  TransposeUV(src, src_stride,
9877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde              dst_a, dst_stride_a,
9887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde              dst_b, dst_stride_b,
9897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde              width, height);
9907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
9917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
99233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
9937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid RotateUV270(const uint8* src, int src_stride,
9947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 uint8* dst_a, int dst_stride_a,
9957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 uint8* dst_b, int dst_stride_b,
9967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 int width, int height) {
9977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  dst_a += dst_stride_a * (width - 1);
9987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  dst_b += dst_stride_b * (width - 1);
9997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  dst_stride_a = -dst_stride_a;
10007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  dst_stride_b = -dst_stride_b;
10017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
10027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  TransposeUV(src, src_stride,
10037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde              dst_a, dst_stride_a,
10047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde              dst_b, dst_stride_b,
10057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde              width, height);
10067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
10077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
100833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Rotate 180 is a horizontal and vertical flip.
100933cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
10107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid RotateUV180(const uint8* src, int src_stride,
10117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 uint8* dst_a, int dst_stride_a,
10127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 uint8* dst_b, int dst_stride_b,
10137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 int width, int height) {
101433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
101533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      MirrorRowUV_C;
101633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(HAS_MIRRORROW_UV_NEON)
101733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasNEON)) {
101833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    MirrorRowUV = MirrorRowUV_NEON;
10197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
102033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#elif defined(HAS_MIRRORROW_UV_SSSE3)
102133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (TestCpuFlag(kCpuHasSSSE3) &&
102233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(width, 16) &&
102333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
102433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    MirrorRowUV = MirrorRowUV_SSSE3;
102533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
102633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
10277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
10287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  dst_a += dst_stride_a * (height - 1);
10297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  dst_b += dst_stride_b * (height - 1);
10307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
103133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int i = 0; i < height; ++i) {
103233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    MirrorRowUV(src, dst_a, dst_b, width);
103333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    src += src_stride;
103433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    dst_a -= dst_stride_a;
103533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    dst_b -= dst_stride_b;
10367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
10377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
10387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
103933cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
10407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeint I420Rotate(const uint8* src_y, int src_stride_y,
10417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde               const uint8* src_u, int src_stride_u,
10427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde               const uint8* src_v, int src_stride_v,
10437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde               uint8* dst_y, int dst_stride_y,
10447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde               uint8* dst_u, int dst_stride_u,
10457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde               uint8* dst_v, int dst_stride_v,
10467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde               int width, int height,
10477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde               RotationMode mode) {
104833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
104933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      !dst_y || !dst_u || !dst_v) {
105033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    return -1;
105133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
10527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  int halfwidth = (width + 1) >> 1;
10537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  int halfheight = (height + 1) >> 1;
10547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
10557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Negative height means invert the image.
10567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  if (height < 0) {
10577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    height = -height;
10587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    halfheight = (height + 1) >> 1;
10597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_y = src_y + (height - 1) * src_stride_y;
10607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_u = src_u + (halfheight - 1) * src_stride_u;
10617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_v = src_v + (halfheight - 1) * src_stride_v;
10627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_stride_y = -src_stride_y;
10637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_stride_u = -src_stride_u;
10647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_stride_v = -src_stride_v;
10657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
10667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
10677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  switch (mode) {
10687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    case kRotate0:
10697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      // copy frame
10707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      return I420Copy(src_y, src_stride_y,
10717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                      src_u, src_stride_u,
10727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                      src_v, src_stride_v,
10737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                      dst_y, dst_stride_y,
10747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                      dst_u, dst_stride_u,
10757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                      dst_v, dst_stride_v,
10767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                      width, height);
10777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    case kRotate90:
10787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane90(src_y, src_stride_y,
10797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    dst_y, dst_stride_y,
10807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    width, height);
10817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane90(src_u, src_stride_u,
10827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    dst_u, dst_stride_u,
10837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    halfwidth, halfheight);
10847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane90(src_v, src_stride_v,
10857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    dst_v, dst_stride_v,
10867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    halfwidth, halfheight);
10877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      return 0;
10887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    case kRotate270:
10897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane270(src_y, src_stride_y,
10907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     dst_y, dst_stride_y,
10917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     width, height);
10927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane270(src_u, src_stride_u,
10937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     dst_u, dst_stride_u,
10947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     halfwidth, halfheight);
10957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane270(src_v, src_stride_v,
10967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     dst_v, dst_stride_v,
10977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     halfwidth, halfheight);
10987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      return 0;
10997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    case kRotate180:
11007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane180(src_y, src_stride_y,
11017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     dst_y, dst_stride_y,
11027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     width, height);
11037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane180(src_u, src_stride_u,
11047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     dst_u, dst_stride_u,
11057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     halfwidth, halfheight);
11067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane180(src_v, src_stride_v,
11077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     dst_v, dst_stride_v,
11087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     halfwidth, halfheight);
11097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      return 0;
11107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    default:
11117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      break;
11127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
11137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  return -1;
11147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
11157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
111633cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
11177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeint NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
11187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     const uint8* src_uv, int src_stride_uv,
11197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     uint8* dst_y, int dst_stride_y,
11207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     uint8* dst_u, int dst_stride_u,
11217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     uint8* dst_v, int dst_stride_v,
11227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     int width, int height,
11237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     RotationMode mode) {
112433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  if (!src_y || !src_uv || width <= 0 || height == 0 ||
112533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      !dst_y || !dst_u || !dst_v) {
112633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    return -1;
112733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
11287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  int halfwidth = (width + 1) >> 1;
11297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  int halfheight = (height + 1) >> 1;
11307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
11317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  // Negative height means invert the image.
11327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  if (height < 0) {
11337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    height = -height;
11347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    halfheight = (height + 1) >> 1;
11357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_y = src_y + (height - 1) * src_stride_y;
11367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
11377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_stride_y = -src_stride_y;
11387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    src_stride_uv = -src_stride_uv;
11397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
11407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
11417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  switch (mode) {
11427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    case kRotate0:
11437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      // copy frame
114433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      return NV12ToI420(src_y, src_stride_y,
114533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        src_uv, src_stride_uv,
11467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                        dst_y, dst_stride_y,
11477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                        dst_u, dst_stride_u,
11487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                        dst_v, dst_stride_v,
11497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                        width, height);
11507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    case kRotate90:
11517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane90(src_y, src_stride_y,
11527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    dst_y, dst_stride_y,
11537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                    width, height);
11547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotateUV90(src_uv, src_stride_uv,
11557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 dst_u, dst_stride_u,
11567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 dst_v, dst_stride_v,
11577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                 halfwidth, halfheight);
11587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      return 0;
11597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    case kRotate270:
11607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane270(src_y, src_stride_y,
11617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     dst_y, dst_stride_y,
11627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     width, height);
11637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotateUV270(src_uv, src_stride_uv,
11647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                  dst_u, dst_stride_u,
11657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                  dst_v, dst_stride_v,
11667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                  halfwidth, halfheight);
11677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      return 0;
11687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    case kRotate180:
11697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotatePlane180(src_y, src_stride_y,
11707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     dst_y, dst_stride_y,
11717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                     width, height);
11727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      RotateUV180(src_uv, src_stride_uv,
11737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                  dst_u, dst_stride_u,
11747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                  dst_v, dst_stride_v,
11757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                  halfwidth, halfheight);
11767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      return 0;
11777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    default:
11787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde      break;
11797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
11807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  return -1;
11817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
11827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
118333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
118433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // extern "C"
11857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}  // namespace libyuv
118633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
1187