/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/rotate_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for 32 bit Visual C x86 (not clang-cl).
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \
    defined(_MSC_VER) && !defined(__clang__)
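
// TransposeWx8_SSSE3 transposes an 8-row strip: each loop iteration reads
// an 8x8 block of bytes and writes it out transposed, interleaving bytes
// (punpcklbw), then words (punpcklwd), then dwords (punpckldq), with
// palignr extracting the high half of each intermediate.
// A minimal sketch of how a caller might tile a plane with it (the real
// dispatch in libyuv lives elsewhere; the variable names are assumed):
//
//   while (height >= 8) {
//     TransposeWx8_SSSE3(src, src_stride, dst, dst_stride, width);
//     src += 8 * src_stride;  // down 8 rows in the source.
//     dst += 8;               // over 8 columns in the destination.
//     height -= 8;
//   }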
__declspec(naked)
void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                        uint8* dst, int dst_stride, int width) {
  __asm {
    push      edi
    push      esi
    push      ebp
    mov       eax, [esp + 12 + 4]   // src
    mov       edi, [esp + 12 + 8]   // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width

    // Read in the data from the source pointer.
    // First round of bit swap.
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]
    lea       ebp, [eax + 8]   // remember src + 8 for the next block.
    movq      xmm1, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm0, xmm1
    movq      xmm2, qword ptr [eax]
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8
    movq      xmm3, qword ptr [eax + edi]
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]
    palignr   xmm3, xmm3, 8
    movq      xmm5, qword ptr [eax + edi]
    punpcklbw xmm4, xmm5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]
    palignr   xmm5, xmm5, 8
    movq      xmm7, qword ptr [eax + edi]
    punpcklbw xmm6, xmm7
    mov       eax, ebp   // advance src to the next 8 columns.
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8
    // Second round of bit swap.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8
    palignr   xmm7, xmm7, 8
    // Third round of bit swap.
    // Write to the destination pointer.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8
    movq      qword ptr [edx + esi], xmm7
    lea       edx, [edx + 2 * esi]
    jg        convertloop

    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}

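// TransposeUVWx8_SSE2 transposes an 8-row strip of interleaved UV pairs:
// each iteration reads 8 rows of 16 bytes (8 pairs), transposes them with
// the same byte/word/dword interleave pattern as above, and writes the
// first byte of each pair to dst_a (movlpd, low qword) and the second to
// dst_b (movhpd, high qword). All 8 XMM registers are live during the
// shuffle, so one value is spilled to a stack scratch slot (set up in the
// prologue below) and reloaded later.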
__declspec(naked)
void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int w) {
  __asm {
    push      ebx
    push      esi
    push      edi
    push      ebp
    mov       eax, [esp + 16 + 4]   // src
    mov       edi, [esp + 16 + 8]   // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
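    // Carve out a 16 byte scratch slot for spilling an XMM register and
    // save the original esp so it can be restored on exit: keep the old
    // esp in ecx, reserve 4 + 16 bytes, align esp down to 16 bytes, and
    // stash the old esp above the slot. `w` is then loaded through the
    // saved esp, since the argument offsets no longer apply.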
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx
    mov       ecx, [ecx + 16 + 28]  // w

    align      4
 convertloop:
    // Read in the data from the source pointer.
    // First round of bit swap.
    movdqu    xmm0, [eax]
    movdqu    xmm1, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7
    movdqu    xmm2, [eax]
    movdqu    xmm3, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7
    movdqu    xmm4, [eax]
    movdqu    xmm5, [eax + edi]
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7
    movdqu    xmm6, [eax]
    movdqu    xmm7, [eax + edi]
    lea       eax, [eax + 2 * edi]
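    // Spill xmm5 to the stack scratch slot so a register is free for the
    // last row pair; eax, now 8 rows past the start, is rewound by 8
    // strides and advanced 16 bytes to the next block of columns (edi is
    // negated around the lea so the stride can be subtracted, then
    // restored).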
    movdqu    [esp], xmm5  // backup xmm5
    neg       edi
    movdqa    xmm5, xmm6   // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5
    lea       eax, [eax + 8 * edi + 16]
    neg       edi
    // Second round of bit swap.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5   // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6
    // Third round of bit swap.
    // Write to the destination pointer.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0
    movhpd    qword ptr [ebx], xmm0
    movlpd    qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm4
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm2   // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2
    movhpd    qword ptr [ebx], xmm2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1   // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1
    movhpd    qword ptr [ebx], xmm1
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3   // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3
    movhpd    qword ptr [ebx], xmm3
    punpckhdq xmm0, xmm7
    sub       ecx, 8
    movlpd    qword ptr [edx + esi], xmm0
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop

    mov       esp, [esp + 16]  // restore the original esp.
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif