/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32-bit Visual C x86 and clang-cl.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}
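
// For reference, each pass of the loop above transposes one 8x8 tile, so the
// routine is equivalent to the plain C sketch below (libyuv also ships a
// portable TransposeWx8_C). TransposeWx8_C_Sketch is a hypothetical name
// used for illustration only, not part of the library.
#if 0  // illustration only; not compiled
static void TransposeWx8_C_Sketch(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride, int width) {
  // Read a block 8 rows high and `width` bytes wide; write `width` rows of
  // 8 bytes each.
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
#endif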

__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Align esp to 16 bytes and reserve a spill slot for one xmm register.
    // The old esp is saved at [esp + 16] and restored before returning.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqu    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
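
// For reference, the routine above consumes 8 rows of `w` interleaved UV
// byte pairs and writes a transposed U plane to dst_a and a transposed V
// plane to dst_b, as in the plain C sketch below (libyuv also ships a
// portable TransposeUVWx8_C). TransposeUVWx8_C_Sketch is a hypothetical
// name used for illustration only, not part of the library.
#if 0  // illustration only; not compiled
static void TransposeUVWx8_C_Sketch(const uint8* src, int src_stride,
                                    uint8* dst_a, int dst_stride_a,
                                    uint8* dst_b, int dst_stride_b, int w) {
  for (int x = 0; x < w; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst_a[x * dst_stride_a + y] = src[y * src_stride + x * 2 + 0];  // U
      dst_b[x * dst_stride_b + y] = src[y * src_stride + x * 2 + 1];  // V
    }
  }
}
#endif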

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
