1/* 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11#include "libyuv/rotate.h" 12 13#include "libyuv/cpu_id.h" 14#include "libyuv/convert.h" 15#include "libyuv/planar_functions.h" 16#include "libyuv/row.h" 17 18#ifdef __cplusplus 19namespace libyuv { 20extern "C" { 21#endif 22 23#if !defined(LIBYUV_DISABLE_X86) && \ 24 (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) 25#if defined(__APPLE__) && defined(__i386__) 26#define DECLARE_FUNCTION(name) \ 27 ".text \n" \ 28 ".private_extern _" #name " \n" \ 29 ".align 4,0x90 \n" \ 30"_" #name ": \n" 31#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) 32#define DECLARE_FUNCTION(name) \ 33 ".text \n" \ 34 ".align 4,0x90 \n" \ 35"_" #name ": \n" 36#else 37#define DECLARE_FUNCTION(name) \ 38 ".text \n" \ 39 ".align 4,0x90 \n" \ 40#name ": \n" 41#endif 42#endif 43 44#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ 45 (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) 46#define HAS_MIRRORROW_NEON 47void MirrorRow_NEON(const uint8* src, uint8* dst, int width); 48#define HAS_MIRRORROW_UV_NEON 49void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width); 50#define HAS_TRANSPOSE_WX8_NEON 51void TransposeWx8_NEON(const uint8* src, int src_stride, 52 uint8* dst, int dst_stride, int width); 53#define HAS_TRANSPOSE_UVWX8_NEON 54void TransposeUVWx8_NEON(const uint8* src, int src_stride, 55 uint8* dst_a, int dst_stride_a, 56 uint8* dst_b, int dst_stride_b, 57 int width); 58#endif // defined(__ARM_NEON__) 59 60#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ 61 defined(__mips__) && \ 62 
    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width);

void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
                               uint8* dst_a, int dst_stride_a,
                               uint8* dst_b, int dst_stride_b,
                               int width);
#endif  // defined(__mips__)

#if !defined(LIBYUV_DISABLE_X86) && \
    defined(_M_IX86) && defined(_MSC_VER)
#define HAS_TRANSPOSE_WX8_SSSE3
// Visual C 32 bit version. Transposes an 8-row band of `width` columns
// (same contract as TransposeWx8_C below): three rounds of byte/word/dword
// unpacks build 8-byte columns that are stored as destination rows.
// Register map: eax=src, edi=src_stride, edx=dst, esi=dst_stride, ecx=width.
__declspec(naked) __declspec(align(16))
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]        // ebp remembers src + 8 for the next tile.
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp              // Step src right by 8 columns.
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8                // 8 columns handled per iteration.
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

#define HAS_TRANSPOSE_UVWX8_SSE2
// Visual C 32 bit version. Transposes an 8-row band of interleaved UV data,
// de-interleaving U into dst_a and V into dst_b (contract of
// TransposeUVWx8_C). Register map: eax=src, edi=src_stride, edx=dst_a,
// esi=dst_stride_a, ebx=dst_b, ebp=dst_stride_b, ecx=w.
// esp is re-aligned to 16 bytes to provide one aligned xmm spill slot at
// [esp]; the caller's esp is saved at [esp + 16] and restored on exit.
__declspec(naked) __declspec(align(16))
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    mov       ecx, esp              // Save pre-alignment esp in ecx.
    sub       esp, 4 + 16
    and       esp, ~15              // 16-byte align for movdqa spills.
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqa    xmm0, [eax]
    movdqa    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqa    xmm2, [eax]
    movdqa    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqa    xmm4, [eax]
    movdqa    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqa    xmm6, [eax]
    movdqa    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6  // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]  // Rewind 8 rows, advance 16 bytes.
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqa    xmm5, [esp]  // restore xmm5
    movdqa    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqa    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0   // Low 8 bytes -> U plane.
    movhpd    qword ptr [ebx], xmm0   // High 8 bytes -> V plane.
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]  // Restore the caller's esp.
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
#elif !defined(LIBYUV_DISABLE_X86) && \
    (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
#define HAS_TRANSPOSE_WX8_SSSE3
// GCC inline-asm version of TransposeWx8 (same algorithm and contract as the
// Visual C version above). %0=src, %1=dst, %2=width, %3=src_stride,
// %4=dst_stride.
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                               uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 2 \n"
  "1: \n"
    "movq (%0),%%xmm0 \n"
    "movq (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "movq (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "movq (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movq (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "movq (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movq (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "lea 0x8(%0,%3,8),%0 \n"  // Rewind 8 rows, advance 8 columns.
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "neg %3 \n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq %%xmm4,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "palignr $0x8,%%xmm4,%%xmm4 \n"
    "movq %%xmm4,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "movq %%xmm2,(%1) \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movq %%xmm6,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm1,%%xmm5 \n"
    "movq %%xmm1,(%1) \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq %%xmm5,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movq %%xmm3,(%1) \n"
    "movdqa %%xmm3,%%xmm7 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "sub $0x8,%2 \n"
    "movq %%xmm7,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(dst_stride))   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
#define HAS_TRANSPOSE_UVWX8_SSE2
// GCC 32 bit version of TransposeUVWx8, written as file-scope (naked)
// assembly because i386 lacks spare registers for the usual constraint form;
// arguments are loaded from the stack per the cdecl layout. Mirrors the
// Visual C version above, including the aligned spill slot at (%esp).
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w);
  asm (
    DECLARE_FUNCTION(TransposeUVWx8_SSE2)
    "push %ebx \n"
    "push %esi \n"
    "push %edi \n"
    "push %ebp \n"
    "mov 0x14(%esp),%eax \n"
    "mov 0x18(%esp),%edi \n"
    "mov 0x1c(%esp),%edx \n"
    "mov 0x20(%esp),%esi \n"
    "mov 0x24(%esp),%ebx \n"
    "mov 0x28(%esp),%ebp \n"
    "mov %esp,%ecx \n"
    "sub $0x14,%esp \n"
    "and $0xfffffff0,%esp \n"
    "mov %ecx,0x10(%esp) \n"
    "mov 0x2c(%ecx),%ecx \n"

"1: \n"
    "movdqa (%eax),%xmm0 \n"
    "movdqa (%eax,%edi,1),%xmm1 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm0,%xmm7 \n"
    "punpcklbw %xmm1,%xmm0 \n"
    "punpckhbw %xmm1,%xmm7 \n"
    "movdqa %xmm7,%xmm1 \n"
    "movdqa (%eax),%xmm2 \n"
    "movdqa (%eax,%edi,1),%xmm3 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm2,%xmm7 \n"
    "punpcklbw %xmm3,%xmm2 \n"
    "punpckhbw %xmm3,%xmm7 \n"
    "movdqa %xmm7,%xmm3 \n"
    "movdqa (%eax),%xmm4 \n"
    "movdqa (%eax,%edi,1),%xmm5 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm4,%xmm7 \n"
    "punpcklbw %xmm5,%xmm4 \n"
    "punpckhbw %xmm5,%xmm7 \n"
    "movdqa %xmm7,%xmm5 \n"
    "movdqa (%eax),%xmm6 \n"
    "movdqa (%eax,%edi,1),%xmm7 \n"
    "lea (%eax,%edi,2),%eax \n"
    "movdqa %xmm5,(%esp) \n"
    "neg %edi \n"
    "movdqa %xmm6,%xmm5 \n"
    "punpcklbw %xmm7,%xmm6 \n"
    "punpckhbw %xmm7,%xmm5 \n"
    "movdqa %xmm5,%xmm7 \n"
    "lea 0x10(%eax,%edi,8),%eax \n"
    "neg %edi \n"
    "movdqa %xmm0,%xmm5 \n"
    "punpcklwd %xmm2,%xmm0 \n"
    "punpckhwd %xmm2,%xmm5 \n"
    "movdqa %xmm5,%xmm2 \n"
    "movdqa %xmm1,%xmm5 \n"
    "punpcklwd %xmm3,%xmm1 \n"
    "punpckhwd %xmm3,%xmm5 \n"
    "movdqa %xmm5,%xmm3 \n"
    "movdqa %xmm4,%xmm5 \n"
    "punpcklwd %xmm6,%xmm4 \n"
    "punpckhwd %xmm6,%xmm5 \n"
    "movdqa %xmm5,%xmm6 \n"
    "movdqa (%esp),%xmm5 \n"
    "movdqa %xmm6,(%esp) \n"
    "movdqa %xmm5,%xmm6 \n"
    "punpcklwd %xmm7,%xmm5 \n"
    "punpckhwd %xmm7,%xmm6 \n"
    "movdqa %xmm6,%xmm7 \n"
    "movdqa %xmm0,%xmm6 \n"
    "punpckldq %xmm4,%xmm0 \n"
    "punpckhdq %xmm4,%xmm6 \n"
    "movdqa %xmm6,%xmm4 \n"
    "movdqa (%esp),%xmm6 \n"
    "movlpd %xmm0,(%edx) \n"
    "movhpd %xmm0,(%ebx) \n"
    "movlpd %xmm4,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm4,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm2,%xmm0 \n"
    "punpckldq %xmm6,%xmm2 \n"
    "movlpd %xmm2,(%edx) \n"
    "movhpd %xmm2,(%ebx) \n"
    "punpckhdq %xmm6,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm1,%xmm0 \n"
    "punpckldq %xmm5,%xmm1 \n"
    "movlpd %xmm1,(%edx) \n"
    "movhpd %xmm1,(%ebx) \n"
    "punpckhdq %xmm5,%xmm0 \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "movdqa %xmm3,%xmm0 \n"
    "punpckldq %xmm7,%xmm3 \n"
    "movlpd %xmm3,(%edx) \n"
    "movhpd %xmm3,(%ebx) \n"
    "punpckhdq %xmm7,%xmm0 \n"
    "sub $0x8,%ecx \n"
    "movlpd %xmm0,(%edx,%esi,1) \n"
    "lea (%edx,%esi,2),%edx \n"
    "movhpd %xmm0,(%ebx,%ebp,1) \n"
    "lea (%ebx,%ebp,2),%ebx \n"
    "jg 1b \n"
    "mov 0x10(%esp),%esp \n"
    "pop %ebp \n"
    "pop %edi \n"
    "pop %esi \n"
    "pop %ebx \n"
#if defined(__native_client__)
    // Native Client sandboxed return: mask the return address to a 32-byte
    // bundle boundary and jump to it instead of using ret.
    "pop %ecx \n"
    "and $0xffffffe0,%ecx \n"
    "jmp *%ecx \n"
#else
    "ret \n"
#endif
);
#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
    defined(__x86_64__)
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
// Uses xmm8-xmm15 to process a 16-column band per iteration (note the
// "sub $0x10" loop step). Same operand layout as TransposeWx8_SSSE3.
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
                                    uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 2 \n"
"1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%3),%%xmm1 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqa (%0),%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm9 \n"
    "palignr $0x8,%%xmm1,%%xmm1 \n"
    "palignr $0x8,%%xmm9,%%xmm9 \n"
    "movdqa (%0,%3),%%xmm3 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm2,%%xmm10 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm10 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "movdqa %%xmm10,%%xmm11 \n"
    "movdqa (%0),%%xmm4 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "movdqa (%0,%3),%%xmm5 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm4,%%xmm12 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm12 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "movdqa %%xmm12,%%xmm13 \n"
    "movdqa (%0),%%xmm6 \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movdqa (%0,%3),%%xmm7 \n"
    "lea (%0,%3,2),%0 \n"
    "movdqa %%xmm6,%%xmm14 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "punpckhbw %%xmm7,%%xmm14 \n"
    "neg %3 \n"
    "movdqa %%xmm6,%%xmm7 \n"
    "movdqa %%xmm14,%%xmm15 \n"
    "lea 0x10(%0,%3,8),%0 \n"  // Rewind 8 rows, advance 16 columns.
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "neg %3 \n"
    // Second round of bit swap.
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "palignr $0x8,%%xmm2,%%xmm2 \n"
    "palignr $0x8,%%xmm3,%%xmm3 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "movdqa %%xmm5,%%xmm7 \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "punpcklwd %%xmm10,%%xmm8 \n"
    "punpcklwd %%xmm11,%%xmm9 \n"
    "movdqa %%xmm8,%%xmm10 \n"
    "movdqa %%xmm9,%%xmm11 \n"
    "palignr $0x8,%%xmm10,%%xmm10 \n"
    "palignr $0x8,%%xmm11,%%xmm11 \n"
    "punpcklwd %%xmm14,%%xmm12 \n"
    "punpcklwd %%xmm15,%%xmm13 \n"
    "movdqa %%xmm12,%%xmm14 \n"
    "movdqa %%xmm13,%%xmm15 \n"
    "palignr $0x8,%%xmm14,%%xmm14 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "punpckldq %%xmm4,%%xmm0 \n"
    "movq %%xmm0,(%1) \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "palignr $0x8,%%xmm4,%%xmm4 \n"
    "movq %%xmm4,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "movq %%xmm2,(%1) \n"
    "palignr $0x8,%%xmm6,%%xmm6 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movq %%xmm6,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm1,%%xmm5 \n"
    "movq %%xmm1,(%1) \n"
    "palignr $0x8,%%xmm5,%%xmm5 \n"
    "movq %%xmm5,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movq %%xmm3,(%1) \n"
    "movdqa %%xmm3,%%xmm7 \n"
    "palignr $0x8,%%xmm7,%%xmm7 \n"
    "movq %%xmm7,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm12,%%xmm8 \n"
    "movq %%xmm8,(%1) \n"
    "movdqa %%xmm8,%%xmm12 \n"
    "palignr $0x8,%%xmm12,%%xmm12 \n"
    "movq %%xmm12,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm14,%%xmm10 \n"
    "movdqa %%xmm10,%%xmm14 \n"
    "movq %%xmm10,(%1) \n"
    "palignr $0x8,%%xmm14,%%xmm14 \n"
    "punpckldq %%xmm13,%%xmm9 \n"
    "movq %%xmm14,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "movdqa %%xmm9,%%xmm13 \n"
    "movq %%xmm9,(%1) \n"
    "palignr $0x8,%%xmm13,%%xmm13 \n"
    "movq %%xmm13,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "punpckldq %%xmm15,%%xmm11 \n"
    "movq %%xmm11,(%1) \n"
    "movdqa %%xmm11,%%xmm15 \n"
    "palignr $0x8,%%xmm15,%%xmm15 \n"
    "sub $0x10,%2 \n"  // 16 columns per iteration.
    "movq %%xmm15,(%1,%4) \n"
    "lea (%1,%4,2),%1 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(dst_stride))   // %4
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
);
}

#define HAS_TRANSPOSE_UVWX8_SSE2
// GCC 64 bit version of TransposeUVWx8: xmm8/xmm9 serve as temporaries so no
// stack spill slot is needed. %0=src, %1=dst_a, %2=dst_b, %3=w,
// %4=src_stride, %5=dst_stride_a, %6=dst_stride_b.
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                uint8* dst_a, int dst_stride_a,
                                uint8* dst_b, int dst_stride_b,
                                int w) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap.
    ".p2align 2 \n"
"1: \n"
    "movdqa (%0),%%xmm0 \n"
    "movdqa (%0,%4),%%xmm1 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm0,%%xmm8 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm1 \n"
    "movdqa (%0),%%xmm2 \n"
    "movdqa (%0,%4),%%xmm3 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm2,%%xmm8 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "punpckhbw %%xmm3,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm3 \n"
    "movdqa (%0),%%xmm4 \n"
    "movdqa (%0,%4),%%xmm5 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "punpcklbw %%xmm5,%%xmm4 \n"
    "punpckhbw %%xmm5,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm5 \n"
    "movdqa (%0),%%xmm6 \n"
    "movdqa (%0,%4),%%xmm7 \n"
    "lea (%0,%4,2),%0 \n"
    "movdqa %%xmm6,%%xmm8 \n"
    "punpcklbw %%xmm7,%%xmm6 \n"
    "neg %4 \n"
    "lea 0x10(%0,%4,8),%0 \n"  // Rewind 8 rows, advance 16 bytes.
    "punpckhbw %%xmm7,%%xmm8 \n"
    "movdqa %%xmm8,%%xmm7 \n"
    "neg %4 \n"
    // Second round of bit swap.
    "movdqa %%xmm0,%%xmm8 \n"
    "movdqa %%xmm1,%%xmm9 \n"
    "punpckhwd %%xmm2,%%xmm8 \n"
    "punpckhwd %%xmm3,%%xmm9 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpcklwd %%xmm3,%%xmm1 \n"
    "movdqa %%xmm8,%%xmm2 \n"
    "movdqa %%xmm9,%%xmm3 \n"
    "movdqa %%xmm4,%%xmm8 \n"
    "movdqa %%xmm5,%%xmm9 \n"
    "punpckhwd %%xmm6,%%xmm8 \n"
    "punpckhwd %%xmm7,%%xmm9 \n"
    "punpcklwd %%xmm6,%%xmm4 \n"
    "punpcklwd %%xmm7,%%xmm5 \n"
    "movdqa %%xmm8,%%xmm6 \n"
    "movdqa %%xmm9,%%xmm7 \n"
    // Third round of bit swap.
    // Write to the destination pointer.
    "movdqa %%xmm0,%%xmm8 \n"
    "punpckldq %%xmm4,%%xmm0 \n"
    "movlpd %%xmm0,(%1) \n"  // Write back U channel
    "movhpd %%xmm0,(%2) \n"  // Write back V channel
    "punpckhdq %%xmm4,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm2,%%xmm8 \n"
    "punpckldq %%xmm6,%%xmm2 \n"
    "movlpd %%xmm2,(%1) \n"
    "movhpd %%xmm2,(%2) \n"
    "punpckhdq %%xmm6,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm1,%%xmm8 \n"
    "punpckldq %%xmm5,%%xmm1 \n"
    "movlpd %%xmm1,(%1) \n"
    "movhpd %%xmm1,(%2) \n"
    "punpckhdq %%xmm5,%%xmm8 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "movdqa %%xmm3,%%xmm8 \n"
    "punpckldq %%xmm7,%%xmm3 \n"
    "movlpd %%xmm3,(%1) \n"
    "movhpd %%xmm3,(%2) \n"
    "punpckhdq %%xmm7,%%xmm8 \n"
    "sub $0x8,%3 \n"
    "movlpd %%xmm8,(%1,%5) \n"
    "lea (%1,%5,2),%1 \n"
    "movhpd %%xmm8,(%2,%6) \n"
    "lea (%2,%6,2),%2 \n"
    "jg 1b \n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(w)       // %3
  : "r"((intptr_t)(src_stride)),    // %4
    "r"((intptr_t)(dst_stride_a)),  // %5
    "r"((intptr_t)(dst_stride_b))   // %6
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4",
"xmm5", "xmm6", "xmm7", 766 "xmm8", "xmm9" 767); 768} 769#endif 770#endif 771 772static void TransposeWx8_C(const uint8* src, int src_stride, 773 uint8* dst, int dst_stride, 774 int width) { 775 int i; 776 for (i = 0; i < width; ++i) { 777 dst[0] = src[0 * src_stride]; 778 dst[1] = src[1 * src_stride]; 779 dst[2] = src[2 * src_stride]; 780 dst[3] = src[3 * src_stride]; 781 dst[4] = src[4 * src_stride]; 782 dst[5] = src[5 * src_stride]; 783 dst[6] = src[6 * src_stride]; 784 dst[7] = src[7 * src_stride]; 785 ++src; 786 dst += dst_stride; 787 } 788} 789 790static void TransposeWxH_C(const uint8* src, int src_stride, 791 uint8* dst, int dst_stride, 792 int width, int height) { 793 int i; 794 for (i = 0; i < width; ++i) { 795 int j; 796 for (j = 0; j < height; ++j) { 797 dst[i * dst_stride + j] = src[j * src_stride + i]; 798 } 799 } 800} 801 802LIBYUV_API 803void TransposePlane(const uint8* src, int src_stride, 804 uint8* dst, int dst_stride, 805 int width, int height) { 806 int i = height; 807 void (*TransposeWx8)(const uint8* src, int src_stride, 808 uint8* dst, int dst_stride, 809 int width) = TransposeWx8_C; 810#if defined(HAS_TRANSPOSE_WX8_NEON) 811 if (TestCpuFlag(kCpuHasNEON)) { 812 TransposeWx8 = TransposeWx8_NEON; 813 } 814#endif 815#if defined(HAS_TRANSPOSE_WX8_SSSE3) 816 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { 817 TransposeWx8 = TransposeWx8_SSSE3; 818 } 819#endif 820#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) 821 if (TestCpuFlag(kCpuHasSSSE3) && 822 IS_ALIGNED(width, 16) && 823 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { 824 TransposeWx8 = TransposeWx8_FAST_SSSE3; 825 } 826#endif 827#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) 828 if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { 829 if (IS_ALIGNED(width, 4) && 830 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { 831 TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; 832 } else { 833 TransposeWx8 = TransposeWx8_MIPS_DSPR2; 834 } 835 } 836#endif 837 838 // Work across the source in 8x8 tiles 839 while (i 
>= 8) { 840 TransposeWx8(src, src_stride, dst, dst_stride, width); 841 src += 8 * src_stride; // Go down 8 rows. 842 dst += 8; // Move over 8 columns. 843 i -= 8; 844 } 845 846 TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); 847} 848 849LIBYUV_API 850void RotatePlane90(const uint8* src, int src_stride, 851 uint8* dst, int dst_stride, 852 int width, int height) { 853 // Rotate by 90 is a transpose with the source read 854 // from bottom to top. So set the source pointer to the end 855 // of the buffer and flip the sign of the source stride. 856 src += src_stride * (height - 1); 857 src_stride = -src_stride; 858 TransposePlane(src, src_stride, dst, dst_stride, width, height); 859} 860 861LIBYUV_API 862void RotatePlane270(const uint8* src, int src_stride, 863 uint8* dst, int dst_stride, 864 int width, int height) { 865 // Rotate by 270 is a transpose with the destination written 866 // from bottom to top. So set the destination pointer to the end 867 // of the buffer and flip the sign of the destination stride. 868 dst += dst_stride * (width - 1); 869 dst_stride = -dst_stride; 870 TransposePlane(src, src_stride, dst, dst_stride, width, height); 871} 872 873LIBYUV_API 874void RotatePlane180(const uint8* src, int src_stride, 875 uint8* dst, int dst_stride, 876 int width, int height) { 877 // Swap first and last row and mirror the content. Uses a temporary row. 
878 align_buffer_64(row, width); 879 const uint8* src_bot = src + src_stride * (height - 1); 880 uint8* dst_bot = dst + dst_stride * (height - 1); 881 int half_height = (height + 1) >> 1; 882 int y; 883 void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; 884 void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; 885#if defined(HAS_MIRRORROW_NEON) 886 if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { 887 MirrorRow = MirrorRow_NEON; 888 } 889#endif 890#if defined(HAS_MIRRORROW_SSE2) 891 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && 892 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && 893 IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { 894 MirrorRow = MirrorRow_SSE2; 895 } 896#endif 897#if defined(HAS_MIRRORROW_SSSE3) 898 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && 899 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && 900 IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { 901 MirrorRow = MirrorRow_SSSE3; 902 } 903#endif 904#if defined(HAS_MIRRORROW_AVX2) 905 if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { 906 MirrorRow = MirrorRow_AVX2; 907 } 908#endif 909#if defined(HAS_MIRRORROW_MIPS_DSPR2) 910 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && 911 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && 912 IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { 913 MirrorRow = MirrorRow_MIPS_DSPR2; 914 } 915#endif 916#if defined(HAS_COPYROW_NEON) 917 if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) { 918 CopyRow = CopyRow_NEON; 919 } 920#endif 921#if defined(HAS_COPYROW_X86) 922 if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { 923 CopyRow = CopyRow_X86; 924 } 925#endif 926#if defined(HAS_COPYROW_SSE2) 927 if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && 928 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) && 929 IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { 930 CopyRow = CopyRow_SSE2; 931 } 932#endif 933#if defined(HAS_COPYROW_ERMS) 934 if (TestCpuFlag(kCpuHasERMS)) { 
935 CopyRow = CopyRow_ERMS; 936 } 937#endif 938#if defined(HAS_COPYROW_MIPS) 939 if (TestCpuFlag(kCpuHasMIPS)) { 940 CopyRow = CopyRow_MIPS; 941 } 942#endif 943 944 // Odd height will harmlessly mirror the middle row twice. 945 for (y = 0; y < half_height; ++y) { 946 MirrorRow(src, row, width); // Mirror first row into a buffer 947 src += src_stride; 948 MirrorRow(src_bot, dst, width); // Mirror last row into first row 949 dst += dst_stride; 950 CopyRow(row, dst_bot, width); // Copy first mirrored row into last 951 src_bot -= src_stride; 952 dst_bot -= dst_stride; 953 } 954 free_aligned_buffer_64(row); 955} 956 957static void TransposeUVWx8_C(const uint8* src, int src_stride, 958 uint8* dst_a, int dst_stride_a, 959 uint8* dst_b, int dst_stride_b, 960 int width) { 961 int i; 962 for (i = 0; i < width; ++i) { 963 dst_a[0] = src[0 * src_stride + 0]; 964 dst_b[0] = src[0 * src_stride + 1]; 965 dst_a[1] = src[1 * src_stride + 0]; 966 dst_b[1] = src[1 * src_stride + 1]; 967 dst_a[2] = src[2 * src_stride + 0]; 968 dst_b[2] = src[2 * src_stride + 1]; 969 dst_a[3] = src[3 * src_stride + 0]; 970 dst_b[3] = src[3 * src_stride + 1]; 971 dst_a[4] = src[4 * src_stride + 0]; 972 dst_b[4] = src[4 * src_stride + 1]; 973 dst_a[5] = src[5 * src_stride + 0]; 974 dst_b[5] = src[5 * src_stride + 1]; 975 dst_a[6] = src[6 * src_stride + 0]; 976 dst_b[6] = src[6 * src_stride + 1]; 977 dst_a[7] = src[7 * src_stride + 0]; 978 dst_b[7] = src[7 * src_stride + 1]; 979 src += 2; 980 dst_a += dst_stride_a; 981 dst_b += dst_stride_b; 982 } 983} 984 985static void TransposeUVWxH_C(const uint8* src, int src_stride, 986 uint8* dst_a, int dst_stride_a, 987 uint8* dst_b, int dst_stride_b, 988 int width, int height) { 989 int i; 990 for (i = 0; i < width * 2; i += 2) { 991 int j; 992 for (j = 0; j < height; ++j) { 993 dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; 994 dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; 995 } 996 } 997} 998 999LIBYUV_API 1000void 
TransposeUV(const uint8* src, int src_stride, 1001 uint8* dst_a, int dst_stride_a, 1002 uint8* dst_b, int dst_stride_b, 1003 int width, int height) { 1004 int i = height; 1005 void (*TransposeUVWx8)(const uint8* src, int src_stride, 1006 uint8* dst_a, int dst_stride_a, 1007 uint8* dst_b, int dst_stride_b, 1008 int width) = TransposeUVWx8_C; 1009#if defined(HAS_TRANSPOSE_UVWX8_NEON) 1010 if (TestCpuFlag(kCpuHasNEON)) { 1011 TransposeUVWx8 = TransposeUVWx8_NEON; 1012 } 1013#elif defined(HAS_TRANSPOSE_UVWX8_SSE2) 1014 if (TestCpuFlag(kCpuHasSSE2) && 1015 IS_ALIGNED(width, 8) && 1016 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { 1017 TransposeUVWx8 = TransposeUVWx8_SSE2; 1018 } 1019#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) 1020 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && 1021 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { 1022 TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; 1023 } 1024#endif 1025 1026 // Work through the source in 8x8 tiles. 1027 while (i >= 8) { 1028 TransposeUVWx8(src, src_stride, 1029 dst_a, dst_stride_a, 1030 dst_b, dst_stride_b, 1031 width); 1032 src += 8 * src_stride; // Go down 8 rows. 1033 dst_a += 8; // Move over 8 columns. 1034 dst_b += 8; // Move over 8 columns. 
1035 i -= 8; 1036 } 1037 1038 TransposeUVWxH_C(src, src_stride, 1039 dst_a, dst_stride_a, 1040 dst_b, dst_stride_b, 1041 width, i); 1042} 1043 1044LIBYUV_API 1045void RotateUV90(const uint8* src, int src_stride, 1046 uint8* dst_a, int dst_stride_a, 1047 uint8* dst_b, int dst_stride_b, 1048 int width, int height) { 1049 src += src_stride * (height - 1); 1050 src_stride = -src_stride; 1051 1052 TransposeUV(src, src_stride, 1053 dst_a, dst_stride_a, 1054 dst_b, dst_stride_b, 1055 width, height); 1056} 1057 1058LIBYUV_API 1059void RotateUV270(const uint8* src, int src_stride, 1060 uint8* dst_a, int dst_stride_a, 1061 uint8* dst_b, int dst_stride_b, 1062 int width, int height) { 1063 dst_a += dst_stride_a * (width - 1); 1064 dst_b += dst_stride_b * (width - 1); 1065 dst_stride_a = -dst_stride_a; 1066 dst_stride_b = -dst_stride_b; 1067 1068 TransposeUV(src, src_stride, 1069 dst_a, dst_stride_a, 1070 dst_b, dst_stride_b, 1071 width, height); 1072} 1073 1074// Rotate 180 is a horizontal and vertical flip. 
1075LIBYUV_API 1076void RotateUV180(const uint8* src, int src_stride, 1077 uint8* dst_a, int dst_stride_a, 1078 uint8* dst_b, int dst_stride_b, 1079 int width, int height) { 1080 int i; 1081 void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = 1082 MirrorUVRow_C; 1083#if defined(HAS_MIRRORUVROW_NEON) 1084 if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { 1085 MirrorRowUV = MirrorUVRow_NEON; 1086 } 1087#elif defined(HAS_MIRRORROW_UV_SSSE3) 1088 if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && 1089 IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { 1090 MirrorRowUV = MirrorUVRow_SSSE3; 1091 } 1092#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2) 1093 if (TestCpuFlag(kCpuHasMIPS_DSPR2) && 1094 IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { 1095 MirrorRowUV = MirrorUVRow_MIPS_DSPR2; 1096 } 1097#endif 1098 1099 dst_a += dst_stride_a * (height - 1); 1100 dst_b += dst_stride_b * (height - 1); 1101 1102 for (i = 0; i < height; ++i) { 1103 MirrorRowUV(src, dst_a, dst_b, width); 1104 src += src_stride; 1105 dst_a -= dst_stride_a; 1106 dst_b -= dst_stride_b; 1107 } 1108} 1109 1110LIBYUV_API 1111int RotatePlane(const uint8* src, int src_stride, 1112 uint8* dst, int dst_stride, 1113 int width, int height, 1114 enum RotationMode mode) { 1115 if (!src || width <= 0 || height == 0 || !dst) { 1116 return -1; 1117 } 1118 1119 // Negative height means invert the image. 
  if (height < 0) {
    // Point the source at its last row and read upwards.
    height = -height;
    src = src + (height - 1) * src_stride;
    src_stride = -src_stride;
  }

  // Dispatch to the rotation-specific helper.
  switch (mode) {
    case kRotate0:
      // copy frame
      CopyPlane(src, src_stride,
                dst, dst_stride,
                width, height);
      return 0;
    case kRotate90:
      RotatePlane90(src, src_stride,
                    dst, dst_stride,
                    width, height);
      return 0;
    case kRotate270:
      RotatePlane270(src, src_stride,
                     dst, dst_stride,
                     width, height);
      return 0;
    case kRotate180:
      RotatePlane180(src, src_stride,
                     dst, dst_stride,
                     width, height);
      return 0;
    default:
      break;
  }
  // Unknown rotation mode.
  return -1;
}

// Rotate an I420 frame: the Y plane at full size, the U and V planes at
// half size ((width + 1) >> 1 by (height + 1) >> 1, rounding up).
// Returns 0 on success, -1 on invalid arguments or unknown mode.
LIBYUV_API
int I420Rotate(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,
               const uint8* src_v, int src_stride_v,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height,
               enum RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    // Point every source plane at its last row and read upwards; the
    // chroma planes use the rounded-up half height.
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_u = src_u + (halfheight - 1) * src_stride_u;
    src_v = src_v + (halfheight - 1) * src_stride_v;
    src_stride_y = -src_stride_y;
    src_stride_u = -src_stride_u;
    src_stride_v = -src_stride_v;
  }

  // Rotate each of the three planes independently with the same mode.
  switch (mode) {
    case kRotate0:
      // copy frame
      return I420Copy(src_y, src_stride_y,
                      src_u, src_stride_u,
                      src_v, src_stride_v,
                      dst_y, dst_stride_y,
                      dst_u, dst_stride_u,
                      dst_v, dst_stride_v,
                      width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotatePlane90(src_u, src_stride_u,
                    dst_u, dst_stride_u,
                    halfwidth, halfheight);
      RotatePlane90(src_v, src_stride_v,
                    dst_v, dst_stride_v,
                    halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane270(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane270(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotatePlane180(src_u, src_stride_u,
                     dst_u, dst_stride_u,
                     halfwidth, halfheight);
      RotatePlane180(src_v, src_stride_v,
                     dst_v, dst_stride_v,
                     halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  // Unknown rotation mode.
  return -1;
}

// Rotate an NV12 frame into I420: the Y plane is rotated directly; the
// interleaved UV plane is split into separate U and V planes while
// rotating.  Returns 0 on success, -1 on invalid arguments or unknown
// mode.
LIBYUV_API
int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
                     const uint8* src_uv, int src_stride_uv,
                     uint8* dst_y, int dst_stride_y,
                     uint8* dst_u, int dst_stride_u,
                     uint8* dst_v, int dst_stride_v,
                     int width, int height,
                     enum RotationMode mode) {
  int halfwidth = (width + 1) >> 1;
  int halfheight = (height + 1) >> 1;
  if
(!src_y || !src_uv || width <= 0 || height == 0 ||
      !dst_y || !dst_u || !dst_v) {
    return -1;
  }

  // Negative height means invert the image.
  if (height < 0) {
    // Point both source planes at their last rows and read upwards; the
    // interleaved UV plane uses the rounded-up half height.
    height = -height;
    halfheight = (height + 1) >> 1;
    src_y = src_y + (height - 1) * src_stride_y;
    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
    src_stride_y = -src_stride_y;
    src_stride_uv = -src_stride_uv;
  }

  // Rotate Y with the plane helpers and UV with the de-interleaving
  // RotateUV helpers, using the same mode for both.
  switch (mode) {
    case kRotate0:
      // copy frame
      return NV12ToI420(src_y, src_stride_y,
                        src_uv, src_stride_uv,
                        dst_y, dst_stride_y,
                        dst_u, dst_stride_u,
                        dst_v, dst_stride_v,
                        width, height);
    case kRotate90:
      RotatePlane90(src_y, src_stride_y,
                    dst_y, dst_stride_y,
                    width, height);
      RotateUV90(src_uv, src_stride_uv,
                 dst_u, dst_stride_u,
                 dst_v, dst_stride_v,
                 halfwidth, halfheight);
      return 0;
    case kRotate270:
      RotatePlane270(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV270(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    case kRotate180:
      RotatePlane180(src_y, src_stride_y,
                     dst_y, dst_stride_y,
                     width, height);
      RotateUV180(src_uv, src_stride_uv,
                  dst_u, dst_stride_u,
                  dst_v, dst_stride_v,
                  halfwidth, halfheight);
      return 0;
    default:
      break;
  }
  // Unknown rotation mode.
  return -1;
}

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif