/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
    defined(_MSC_VER) && !defined(__clang__)

// Transpose an 8-row-tall strip of bytes: each loop iteration reads 8 bytes
// from each of 8 consecutive source rows and writes them out as 8 rows of
// 8 bytes each, i.e. an 8x8 byte transpose per iteration.  `width` is
// consumed 8 columns at a time (see `sub ecx, 8` before `jg`), so callers
// are expected to pass a multiple of 8 — NOTE(review): remainder handling,
// if any, is done elsewhere; confirm against callers.
//
// Naked function: no compiler prologue/epilogue.  Callee-saved registers
// edi/esi/ebp are pushed manually, which is why the argument offsets below
// are [esp + 12 + N] (12 bytes of pushes above the return address).
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      4
 convertloop:
    // Load 8 bytes from each of 8 rows and interleave adjacent row pairs
    // at byte granularity (punpcklbw).  ebp remembers src + 8 so eax can
    // be restored to the next 8-column chunk after walking 8 rows down.
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    // palignr reg, reg, 8 copies the high 8 bytes into the low half — it
    // splits each 16-byte interleave result into two 8-byte halves.
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp              // rewind src to start of next 8 columns.
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap: interleave the pairs at word granularity.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap: interleave at dword granularity, then
    // write each completed 8-byte transposed row to the destination.
    // Stores are interleaved with the shuffles to hide latency.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    // sub is placed before the final store so the flags it sets are still
    // live for jg; the intervening movq/lea do not modify flags.
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

// Transpose an 8-row strip of interleaved UV pairs, de-interleaving as it
// goes: each iteration reads 16 bytes from each of 8 rows and writes the
// U plane rows to dst_a and the V plane rows to dst_b (low qword of each
// transposed result goes to dst_a via movlpd, high qword to dst_b via
// movhpd).  `w` is consumed 8 at a time.
//
// Uses all 8 xmm registers plus one 16-byte spill slot on the stack; the
// stack is realigned to 16 bytes so the slot could use aligned access
// (movdqu is used regardless).  ebx/esi/edi/ebp are pushed manually, hence
// the [esp + 16 + N] argument offsets.
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Carve out a 16-byte-aligned spill slot: save the old esp in ecx,
    // reserve 4 (saved esp) + 16 (slot) bytes, align down to 16, then
    // stash the old esp at [esp + 16] so it can be restored at the end.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w   (read via the saved, unaligned esp)

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap: interleave row pairs at byte granularity,
    // keeping both low (punpcklbw) and high (punpckhbw) halves.
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax,  [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax,  [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax,  [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax,  [eax + 2 * edi]
    movdqu    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    // eax has advanced 8 rows; with edi negated this rewinds it by
    // 8 * src_stride and advances 16 bytes to the next column chunk.
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap: interleave at word granularity.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap: interleave at dword granularity and write.
    // Each result holds a U row in its low qword (-> dst_a) and the
    // matching V row in its high qword (-> dst_b).
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    // sub before the final stores: the remaining movlpd/lea/movhpd do not
    // alter flags, so jg still tests (w -= 8) > 0.
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]  // restore the original, unaligned esp.
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif