/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 and clangcl
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

// Transposes 8 rows of 8-bit pixels at a time.
// Each loop pass reads 8 bytes from each of 8 consecutive source rows
// (stepping src_stride between rows) and writes them back out as eight
// 8-byte rows of dst (stepping dst_stride), i.e. an 8x8 byte transpose.
// The source read position then advances 8 bytes to the right (via ebp)
// and 'width' is decremented by 8 until it reaches zero.
// NOTE(review): the loop handles 'width' only in multiples of 8 — there
// is no remainder path; callers presumably guarantee width % 8 == 0.
// The transpose itself is done in three interleave rounds:
// bytes (punpcklbw), words (punpcklwd), then dwords (punpckldq), with
// palignr used to split each 16-byte register back into two 8-byte halves.
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    // 12 bytes of saved registers sit between esp and the arguments.
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap: interleave bytes of row pairs.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]        // row 0
    lea       ebp, [eax + 8]               // remember src + 8 for next pass
    movq      xmm1, qword ptr [eax + edi]  // row 1
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1                   // rows 0/1 byte-interleaved
    movq      xmm2, qword ptr [eax]        // row 2
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8                // high half of xmm0 -> xmm1
    movq      xmm3, qword ptr [eax + edi]  // row 3
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3                   // rows 2/3 byte-interleaved
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]        // row 4
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]  // row 5
    punpcklbw xmm4, xmm5                   // rows 4/5 byte-interleaved
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]        // row 6
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]  // row 7
    punpcklbw xmm6, xmm7                   // rows 6/7 byte-interleaved
    mov       eax, ebp                     // rewind src to top row, +8 cols
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap: interleave 16-bit words.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap: interleave 32-bit dwords.
    // Write to the destination pointer: each qword is one transposed row.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8                       // consumed 8 columns
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

// Transposes 8 rows of interleaved 2-byte (e.g. UV) pixels at a time.
// Each loop pass reads 16 bytes (8 two-byte pairs) from each of 8 source
// rows and, after three interleave rounds (bytes/words/dwords), writes
// the low qword of each result to dst_a (movlpd) and the high qword to
// dst_b (movhpd) — presumably separating U bytes into dst_a and V bytes
// into dst_b; confirm against callers.
// Only 8 XMM registers exist in 32-bit mode, so one register is spilled
// to a 16-byte aligned scratch slot carved out of the stack: esp is
// aligned down to 16 and the original esp saved at [esp + 16], then
// restored before returning.
// NOTE(review): as above, 'w' is consumed in steps of 8 with no
// remainder handling.
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    // 16 bytes of saved registers sit between esp and the arguments.
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Align esp to 16 and reserve a 16-byte spill slot; keep the
    // original esp (in ecx) at [esp + 16] so it can be restored.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w, loaded via the saved frame

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap: interleave bytes of row pairs.
    movdqu    xmm0, [eax]                  // row 0
    movdqu    xmm1, [eax + edi]            // row 1
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]                  // row 2
    movdqu    xmm3, [eax + edi]            // row 3
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]                  // row 4
    movdqu    xmm5, [eax + edi]            // row 5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]                  // row 6
    movdqu    xmm7, [eax + edi]            // row 7
    lea       eax, [eax + 2 * edi]
    movdqu    [esp], xmm5  // backup xmm5 to the aligned scratch slot
    // Rewind src 8 rows and advance 16 bytes for the next pass:
    // eax += 16 - 8 * src_stride (edi negated around the lea).
    neg       edi
    movdqa    xmm5, xmm6  // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap: interleave 16-bit words.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5  // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap: interleave 32-bit dwords.
    // Write to the destination pointers: low qword -> dst_a,
    // high qword -> dst_b.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8                       // consumed 8 pixel pairs
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]  // restore the original, unaligned esp
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif