/*
 *  Copyright 2015 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
#if defined(HAS_TRANSPOSEWX8_SSSE3)
// Transposes the image in 8x8-byte tiles: each iteration reads 8 bytes from
// each of 8 consecutive rows of src and writes them out as 8 rows of dst so
// that source rows become destination columns.
//   src/src_stride: source pixels and stride in bytes between source rows.
//   dst/dst_stride: destination pixels and stride in bytes between dst rows.
//   width: number of source columns to transpose; consumed 8 at a time
//          (the loop does "sub $8" and repeats while the result is > 0).
// NOTE(review): caller is expected to guarantee 8 readable rows and a
// width that keeps all 8-byte loads in bounds — confirm against callers.
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap: punpcklbw interleaves the bytes of each
    // row pair; palignr $8 extracts the high half into the odd register.
    LABELALIGN
  "1:                                            \n"
    "movq       (%0),%%xmm0                      \n"
    "movq       (%0,%3),%%xmm1                   \n"
    "lea        (%0,%3,2),%0                     \n"  // advance src 2 rows
    "punpcklbw  %%xmm1,%%xmm0                    \n"
    "movq       (%0),%%xmm2                      \n"
    "movdqa     %%xmm0,%%xmm1                    \n"
    "palignr    $0x8,%%xmm1,%%xmm1               \n"
    "movq       (%0,%3),%%xmm3                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm3,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm3                    \n"
    "movq       (%0),%%xmm4                      \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "movq       (%0,%3),%%xmm5                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm5,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm5                    \n"
    "movq       (%0),%%xmm6                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       (%0,%3),%%xmm7                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "punpcklbw  %%xmm7,%%xmm6                    \n"
    // Rewind src to the top row and step 8 bytes right for the next tile:
    // src = src - 8*src_stride + 8, done by temporarily negating the
    // stride register so lea can scale it by 8.
    "neg        %3                               \n"
    "movdqa     %%xmm6,%%xmm7                    \n"
    "lea        0x8(%0,%3,8),%0                  \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "neg        %3                               \n"
    // Second round of bit swap: interleave 16-bit units.
    "punpcklwd  %%xmm2,%%xmm0                    \n"
    "punpcklwd  %%xmm3,%%xmm1                    \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    "palignr    $0x8,%%xmm2,%%xmm2               \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "punpcklwd  %%xmm6,%%xmm4                    \n"
    "punpcklwd  %%xmm7,%%xmm5                    \n"
    "movdqa     %%xmm4,%%xmm6                    \n"
    "movdqa     %%xmm5,%%xmm7                    \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    // Third round of bit swap: interleave 32-bit units.
    // Write to the destination pointer, 8 bytes per dst row, interleaved
    // with the shuffles to hide store latency.
    "punpckldq  %%xmm4,%%xmm0                    \n"
    "movq       %%xmm0,(%1)                      \n"
    "movdqa     %%xmm0,%%xmm4                    \n"
    "palignr    $0x8,%%xmm4,%%xmm4               \n"
    "movq       %%xmm4,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"  // advance dst 2 rows
    "punpckldq  %%xmm6,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm6                    \n"
    "movq       %%xmm2,(%1)                      \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "punpckldq  %%xmm5,%%xmm1                    \n"
    "movq       %%xmm6,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "movdqa     %%xmm1,%%xmm5                    \n"
    "movq       %%xmm1,(%1)                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       %%xmm5,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm7,%%xmm3                    \n"
    "movq       %%xmm3,(%1)                      \n"
    "movdqa     %%xmm3,%%xmm7                    \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "sub        $0x8,%2                          \n"  // width -= 8; sets flags for jg
    "movq       %%xmm7,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(dst_stride))   // %4
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // defined(HAS_TRANSPOSEWX8_SSSE3)

// Transpose 16x8. 64 bit
#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Same contract as TransposeWx8_SSSE3 but processes 16 source columns per
// iteration ("sub $0x10") by using xmm8-xmm15 for the high 8 bytes of each
// 16-byte row load, hence 64-bit only. Uses movdqu so src need not be
// 16-byte aligned.
void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap: low halves land in xmm0-7, high halves in
    // xmm8-15, each row pair byte-interleaved.
    LABELALIGN
  "1:                                            \n"
    "movdqu     (%0),%%xmm0                      \n"
    "movdqu     (%0,%3),%%xmm1                   \n"
    "lea        (%0,%3,2),%0                     \n"  // advance src 2 rows
    "movdqa     %%xmm0,%%xmm8                    \n"
    "punpcklbw  %%xmm1,%%xmm0                    \n"
    "punpckhbw  %%xmm1,%%xmm8                    \n"
    "movdqu     (%0),%%xmm2                      \n"
    "movdqa     %%xmm0,%%xmm1                    \n"
    "movdqa     %%xmm8,%%xmm9                    \n"
    "palignr    $0x8,%%xmm1,%%xmm1               \n"
    "palignr    $0x8,%%xmm9,%%xmm9               \n"
    "movdqu     (%0,%3),%%xmm3                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "movdqa     %%xmm2,%%xmm10                   \n"
    "punpcklbw  %%xmm3,%%xmm2                    \n"
    "punpckhbw  %%xmm3,%%xmm10                   \n"
    "movdqa     %%xmm2,%%xmm3                    \n"
    "movdqa     %%xmm10,%%xmm11                  \n"
    "movdqu     (%0),%%xmm4                      \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "palignr    $0x8,%%xmm11,%%xmm11             \n"
    "movdqu     (%0,%3),%%xmm5                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "movdqa     %%xmm4,%%xmm12                   \n"
    "punpcklbw  %%xmm5,%%xmm4                    \n"
    "punpckhbw  %%xmm5,%%xmm12                   \n"
    "movdqa     %%xmm4,%%xmm5                    \n"
    "movdqa     %%xmm12,%%xmm13                  \n"
    "movdqu     (%0),%%xmm6                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "palignr    $0x8,%%xmm13,%%xmm13             \n"
    "movdqu     (%0,%3),%%xmm7                   \n"
    "lea        (%0,%3,2),%0                     \n"
    "movdqa     %%xmm6,%%xmm14                   \n"
    "punpcklbw  %%xmm7,%%xmm6                    \n"
    "punpckhbw  %%xmm7,%%xmm14                   \n"
    // Rewind src to the top row and step 16 bytes right for the next tile:
    // src = src - 8*src_stride + 16 (stride temporarily negated for lea).
    "neg        %3                               \n"
    "movdqa     %%xmm6,%%xmm7                    \n"
    "movdqa     %%xmm14,%%xmm15                  \n"
    "lea        0x10(%0,%3,8),%0                 \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "palignr    $0x8,%%xmm15,%%xmm15             \n"
    "neg        %3                               \n"
    // Second round of bit swap: interleave 16-bit units.
    "punpcklwd  %%xmm2,%%xmm0                    \n"
    "punpcklwd  %%xmm3,%%xmm1                    \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    "palignr    $0x8,%%xmm2,%%xmm2               \n"
    "palignr    $0x8,%%xmm3,%%xmm3               \n"
    "punpcklwd  %%xmm6,%%xmm4                    \n"
    "punpcklwd  %%xmm7,%%xmm5                    \n"
    "movdqa     %%xmm4,%%xmm6                    \n"
    "movdqa     %%xmm5,%%xmm7                    \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "punpcklwd  %%xmm10,%%xmm8                   \n"
    "punpcklwd  %%xmm11,%%xmm9                   \n"
    "movdqa     %%xmm8,%%xmm10                   \n"
    "movdqa     %%xmm9,%%xmm11                   \n"
    "palignr    $0x8,%%xmm10,%%xmm10             \n"
    "palignr    $0x8,%%xmm11,%%xmm11             \n"
    "punpcklwd  %%xmm14,%%xmm12                  \n"
    "punpcklwd  %%xmm15,%%xmm13                  \n"
    "movdqa     %%xmm12,%%xmm14                  \n"
    "movdqa     %%xmm13,%%xmm15                  \n"
    "palignr    $0x8,%%xmm14,%%xmm14             \n"
    "palignr    $0x8,%%xmm15,%%xmm15             \n"
    // Third round of bit swap: interleave 32-bit units.
    // Write to the destination pointer: 16 dst rows of 8 bytes each
    // (low columns from xmm0-7, high columns from xmm8-15).
    "punpckldq  %%xmm4,%%xmm0                    \n"
    "movq       %%xmm0,(%1)                      \n"
    "movdqa     %%xmm0,%%xmm4                    \n"
    "palignr    $0x8,%%xmm4,%%xmm4               \n"
    "movq       %%xmm4,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"  // advance dst 2 rows
    "punpckldq  %%xmm6,%%xmm2                    \n"
    "movdqa     %%xmm2,%%xmm6                    \n"
    "movq       %%xmm2,(%1)                      \n"
    "palignr    $0x8,%%xmm6,%%xmm6               \n"
    "punpckldq  %%xmm5,%%xmm1                    \n"
    "movq       %%xmm6,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "movdqa     %%xmm1,%%xmm5                    \n"
    "movq       %%xmm1,(%1)                      \n"
    "palignr    $0x8,%%xmm5,%%xmm5               \n"
    "movq       %%xmm5,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm7,%%xmm3                    \n"
    "movq       %%xmm3,(%1)                      \n"
    "movdqa     %%xmm3,%%xmm7                    \n"
    "palignr    $0x8,%%xmm7,%%xmm7               \n"
    "movq       %%xmm7,(%1,%4)                   \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm12,%%xmm8                   \n"
    "movq       %%xmm8,(%1)                      \n"
    "movdqa     %%xmm8,%%xmm12                   \n"
    "palignr    $0x8,%%xmm12,%%xmm12             \n"
    "movq       %%xmm12,(%1,%4)                  \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm14,%%xmm10                  \n"
    "movdqa     %%xmm10,%%xmm14                  \n"
    "movq       %%xmm10,(%1)                     \n"
    "palignr    $0x8,%%xmm14,%%xmm14             \n"
    "punpckldq  %%xmm13,%%xmm9                   \n"
    "movq       %%xmm14,(%1,%4)                  \n"
    "lea        (%1,%4,2),%1                     \n"
    "movdqa     %%xmm9,%%xmm13                   \n"
    "movq       %%xmm9,(%1)                      \n"
    "palignr    $0x8,%%xmm13,%%xmm13             \n"
    "movq       %%xmm13,(%1,%4)                  \n"
    "lea        (%1,%4,2),%1                     \n"
    "punpckldq  %%xmm15,%%xmm11                  \n"
    "movq       %%xmm11,(%1)                     \n"
    "movdqa     %%xmm11,%%xmm15                  \n"
    "palignr    $0x8,%%xmm15,%%xmm15             \n"
    "sub        $0x10,%2                         \n"  // width -= 16; sets flags for jg
    "movq       %%xmm15,(%1,%4)                  \n"
    "lea        (%1,%4,2),%1                     \n"
    "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(dst_stride))   // %4
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
  );
}
#endif  // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)

// Transpose UV 8x8. 64 bit.
#if defined(HAS_TRANSPOSEUVWX8_SSE2)
// Transposes an interleaved UV image 8 rows at a time while de-interleaving:
// each 16-byte source row holds 8 U/V byte pairs; the transposed U bytes go
// to dst_a and the V bytes to dst_b (movlpd writes the low 8 bytes = U,
// movhpd the high 8 bytes = V after the shuffle rounds).
//   width: number of UV pixel pairs to process; consumed 8 per iteration.
// 64-bit only (uses xmm8/xmm9 as scratch).
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b, int width) {
  asm volatile (
    // Read in the data from the source pointer.
    // First round of bit swap: byte-interleave each pair of rows; the high
    // half of each interleave is parked via xmm8 into the odd register.
    LABELALIGN
  "1:                                            \n"
    "movdqu     (%0),%%xmm0                      \n"
    "movdqu     (%0,%4),%%xmm1                   \n"
    "lea        (%0,%4,2),%0                     \n"  // advance src 2 rows
    "movdqa     %%xmm0,%%xmm8                    \n"
    "punpcklbw  %%xmm1,%%xmm0                    \n"
    "punpckhbw  %%xmm1,%%xmm8                    \n"
    "movdqa     %%xmm8,%%xmm1                    \n"
    "movdqu     (%0),%%xmm2                      \n"
    "movdqu     (%0,%4),%%xmm3                   \n"
    "lea        (%0,%4,2),%0                     \n"
    "movdqa     %%xmm2,%%xmm8                    \n"
    "punpcklbw  %%xmm3,%%xmm2                    \n"
    "punpckhbw  %%xmm3,%%xmm8                    \n"
    "movdqa     %%xmm8,%%xmm3                    \n"
    "movdqu     (%0),%%xmm4                      \n"
    "movdqu     (%0,%4),%%xmm5                   \n"
    "lea        (%0,%4,2),%0                     \n"
    "movdqa     %%xmm4,%%xmm8                    \n"
    "punpcklbw  %%xmm5,%%xmm4                    \n"
    "punpckhbw  %%xmm5,%%xmm8                    \n"
    "movdqa     %%xmm8,%%xmm5                    \n"
    "movdqu     (%0),%%xmm6                      \n"
    "movdqu     (%0,%4),%%xmm7                   \n"
    "lea        (%0,%4,2),%0                     \n"
    "movdqa     %%xmm6,%%xmm8                    \n"
    "punpcklbw  %%xmm7,%%xmm6                    \n"
    // Rewind src to the top row and step 16 bytes right for the next tile:
    // src = src - 8*src_stride + 16 (stride temporarily negated for lea).
    "neg        %4                               \n"
    "lea        0x10(%0,%4,8),%0                 \n"
    "punpckhbw  %%xmm7,%%xmm8                    \n"
    "movdqa     %%xmm8,%%xmm7                    \n"
    "neg        %4                               \n"
    // Second round of bit swap: interleave 16-bit units; punpckhwd results
    // are staged in xmm8/xmm9 then moved back over the source registers.
    "movdqa     %%xmm0,%%xmm8                    \n"
    "movdqa     %%xmm1,%%xmm9                    \n"
    "punpckhwd  %%xmm2,%%xmm8                    \n"
    "punpckhwd  %%xmm3,%%xmm9                    \n"
    "punpcklwd  %%xmm2,%%xmm0                    \n"
    "punpcklwd  %%xmm3,%%xmm1                    \n"
    "movdqa     %%xmm8,%%xmm2                    \n"
    "movdqa     %%xmm9,%%xmm3                    \n"
    "movdqa     %%xmm4,%%xmm8                    \n"
    "movdqa     %%xmm5,%%xmm9                    \n"
    "punpckhwd  %%xmm6,%%xmm8                    \n"
    "punpckhwd  %%xmm7,%%xmm9                    \n"
    "punpcklwd  %%xmm6,%%xmm4                    \n"
    "punpcklwd  %%xmm7,%%xmm5                    \n"
    "movdqa     %%xmm8,%%xmm6                    \n"
    "movdqa     %%xmm9,%%xmm7                    \n"
    // Third round of bit swap.
    // Write to the destination pointer: low qword (U plane) to dst_a,
    // high qword (V plane) to dst_b, two dst rows per source register pair.
    "movdqa     %%xmm0,%%xmm8                    \n"
    "punpckldq  %%xmm4,%%xmm0                    \n"
    "movlpd     %%xmm0,(%1)                      \n"  // Write back U channel
    "movhpd     %%xmm0,(%2)                      \n"  // Write back V channel
    "punpckhdq  %%xmm4,%%xmm8                    \n"
    "movlpd     %%xmm8,(%1,%5)                   \n"
    "lea        (%1,%5,2),%1                     \n"
    "movhpd     %%xmm8,(%2,%6)                   \n"
    "lea        (%2,%6,2),%2                     \n"
    "movdqa     %%xmm2,%%xmm8                    \n"
    "punpckldq  %%xmm6,%%xmm2                    \n"
    "movlpd     %%xmm2,(%1)                      \n"
    "movhpd     %%xmm2,(%2)                      \n"
    "punpckhdq  %%xmm6,%%xmm8                    \n"
    "movlpd     %%xmm8,(%1,%5)                   \n"
    "lea        (%1,%5,2),%1                     \n"
    "movhpd     %%xmm8,(%2,%6)                   \n"
    "lea        (%2,%6,2),%2                     \n"
    "movdqa     %%xmm1,%%xmm8                    \n"
    "punpckldq  %%xmm5,%%xmm1                    \n"
    "movlpd     %%xmm1,(%1)                      \n"
    "movhpd     %%xmm1,(%2)                      \n"
    "punpckhdq  %%xmm5,%%xmm8                    \n"
    "movlpd     %%xmm8,(%1,%5)                   \n"
    "lea        (%1,%5,2),%1                     \n"
    "movhpd     %%xmm8,(%2,%6)                   \n"
    "lea        (%2,%6,2),%2                     \n"
    "movdqa     %%xmm3,%%xmm8                    \n"
    "punpckldq  %%xmm7,%%xmm3                    \n"
    "movlpd     %%xmm3,(%1)                      \n"
    "movhpd     %%xmm3,(%2)                      \n"
    "punpckhdq  %%xmm7,%%xmm8                    \n"
    "sub        $0x8,%3                          \n"  // width -= 8; sets flags for jg
    "movlpd     %%xmm8,(%1,%5)                   \n"
    "lea        (%1,%5,2),%1                     \n"
    "movhpd     %%xmm8,(%2,%6)                   \n"
    "lea        (%2,%6,2),%2                     \n"
    "jg         1b                               \n"
  : "+r"(src),    // %0
    "+r"(dst_a),  // %1
    "+r"(dst_b),  // %2
    "+r"(width)   // %3
  : "r"((intptr_t)(src_stride)),    // %4
    "r"((intptr_t)(dst_stride_a)),  // %5
    "r"((intptr_t)(dst_stride_b))   // %6
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
    "xmm8", "xmm9"
  );
}
#endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif