1/*
2 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/rotate_row.h"
12#include "libyuv/row.h"
13
14#ifdef __cplusplus
15namespace libyuv {
16extern "C" {
17#endif
18
19// This module is for 32 bit Visual C x86 and clangcl
20#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
21
// Transposes a tile 8 rows high and `width` columns wide:
// reads 8 rows of `width` bytes from `src` (rows `src_stride` apart) and
// writes `width` rows of 8 bytes to `dst` (rows `dst_stride` apart).
// Processes 8 source columns per loop iteration, so `width` should be a
// multiple of 8 — TODO confirm against callers.
// Requires SSSE3 (for palignr).  __declspec(naked): the compiler emits no
// prologue/epilogue; register saves and argument access are explicit below.
__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
                                          int src_stride,
                                          uint8* dst,
                                          int dst_stride,
                                          int width) {
  __asm {
    // Save callee-saved registers used by this routine (x86 cdecl).
    push      edi
    push      esi
    push      ebp
    // Arguments sit above the return address plus the 12 bytes pushed above.
    mov       eax, [esp + 12 + 4]  // src
    mov       edi, [esp + 12 + 8]  // src_stride
    mov       edx, [esp + 12 + 12]  // dst
    mov       esi, [esp + 12 + 16]  // dst_stride
    mov       ecx, [esp + 12 + 20]  // width (remaining columns; 8 per pass)

    // Read in the data from the source pointer.
    // First round of bit swap: interleave row pairs at byte granularity.
    // After this round, xmm0/1, xmm2/3, xmm4/5, xmm6/7 hold the byte-
    // interleaved low/high halves of row pairs (0,1), (2,3), (4,5), (6,7).
    align      4
 convertloop:
    movq      xmm0, qword ptr [eax]         // row 0 (8 bytes)
    lea       ebp, [eax + 8]                // ebp = start of next 8-column tile
    movq      xmm1, qword ptr [eax + edi]   // row 1
    lea       eax, [eax + 2 * edi]          // advance src two rows
    punpcklbw xmm0, xmm1                    // interleave bytes of rows 0,1
    movq      xmm2, qword ptr [eax]         // row 2
    movdqa    xmm1, xmm0
    palignr   xmm1, xmm1, 8                 // xmm1 = high 8 bytes of xmm0
    movq      xmm3, qword ptr [eax + edi]   // row 3
    lea       eax, [eax + 2 * edi]
    punpcklbw xmm2, xmm3                    // interleave bytes of rows 2,3
    movdqa    xmm3, xmm2
    movq      xmm4, qword ptr [eax]         // row 4
    palignr   xmm3, xmm3, 8                 // xmm3 = high 8 bytes of xmm2
    movq      xmm5, qword ptr [eax + edi]   // row 5
    punpcklbw xmm4, xmm5                    // interleave bytes of rows 4,5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm5, xmm4
    movq      xmm6, qword ptr [eax]         // row 6
    palignr   xmm5, xmm5, 8                 // xmm5 = high 8 bytes of xmm4
    movq      xmm7, qword ptr [eax + edi]   // row 7
    punpcklbw xmm6, xmm7                    // interleave bytes of rows 6,7
    mov       eax, ebp                      // src = next 8-column tile
    movdqa    xmm7, xmm6
    palignr   xmm7, xmm7, 8                 // xmm7 = high 8 bytes of xmm6
    // Second round of bit swap: interleave the pair results at word
    // granularity, building 4-byte column groups.
    punpcklwd xmm0, xmm2
    punpcklwd xmm1, xmm3
    movdqa    xmm2, xmm0
    movdqa    xmm3, xmm1
    palignr   xmm2, xmm2, 8                 // keep high halves in xmm2/xmm3
    palignr   xmm3, xmm3, 8
    punpcklwd xmm4, xmm6
    punpcklwd xmm5, xmm7
    movdqa    xmm6, xmm4
    movdqa    xmm7, xmm5
    palignr   xmm6, xmm6, 8                 // keep high halves in xmm6/xmm7
    palignr   xmm7, xmm7, 8
    // Third round of bit swap: interleave at dword granularity; each
    // resulting qword is one transposed output row of 8 bytes.
    // Write to the destination pointer, two output rows per register.
    punpckldq xmm0, xmm4
    movq      qword ptr [edx], xmm0         // output row 0
    movdqa    xmm4, xmm0
    palignr   xmm4, xmm4, 8                 // high qword = output row 1
    movq      qword ptr [edx + esi], xmm4
    lea       edx, [edx + 2 * esi]          // advance dst two rows
    punpckldq xmm2, xmm6
    movdqa    xmm6, xmm2
    palignr   xmm6, xmm6, 8
    movq      qword ptr [edx], xmm2         // output row 2
    punpckldq xmm1, xmm5
    movq      qword ptr [edx + esi], xmm6   // output row 3
    lea       edx, [edx + 2 * esi]
    movdqa    xmm5, xmm1
    movq      qword ptr [edx], xmm1         // output row 4
    palignr   xmm5, xmm5, 8
    punpckldq xmm3, xmm7
    movq      qword ptr [edx + esi], xmm5   // output row 5
    lea       edx, [edx + 2 * esi]
    movq      qword ptr [edx], xmm3         // output row 6
    movdqa    xmm7, xmm3
    palignr   xmm7, xmm7, 8
    sub       ecx, 8                        // 8 columns consumed
    movq      qword ptr [edx + esi], xmm7   // output row 7
    lea       edx, [edx + 2 * esi]
    jg        convertloop                   // loop while columns remain

    // Restore callee-saved registers and return.
    pop       ebp
    pop       esi
    pop       edi
    ret
  }
}
114
// Transposes an 8-row block of interleaved UV pairs, de-interleaving as it
// goes: reads 8 rows of `w` UV byte-pairs (16*w/8 bytes per iteration) from
// `src` and writes `w` rows of 8 U bytes to `dst_a` and 8 V bytes to
// `dst_b`.  Processes 8 UV columns (16 source bytes per row) per loop
// iteration, so `w` should be a multiple of 8 — TODO confirm against
// callers.  Requires SSE2 only.  All 8 xmm registers are live at once, so
// 16 bytes of aligned stack scratch are used to spill one register.
// __declspec(naked): no compiler prologue/epilogue; everything is explicit.
__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
                                           int src_stride,
                                           uint8* dst_a,
                                           int dst_stride_a,
                                           uint8* dst_b,
                                           int dst_stride_b,
                                           int w) {
  __asm {
    // Save callee-saved registers used by this routine (x86 cdecl).
    push      ebx
    push      esi
    push      edi
    push      ebp
    // Arguments sit above the return address plus the 16 bytes pushed above.
    mov       eax, [esp + 16 + 4]  // src
    mov       edi, [esp + 16 + 8]  // src_stride
    mov       edx, [esp + 16 + 12]  // dst_a
    mov       esi, [esp + 16 + 16]  // dst_stride_a
    mov       ebx, [esp + 16 + 20]  // dst_b
    mov       ebp, [esp + 16 + 24]  // dst_stride_b
    // Carve out 16-byte-aligned scratch: [esp..esp+15] holds one spilled
    // xmm register; the caller's esp is preserved at [esp+16].  ecx
    // temporarily holds the original esp so `w` can still be read from the
    // original frame.
    mov       ecx, esp
    sub       esp, 4 + 16
    and       esp, ~15
    mov       [esp + 16], ecx     // remember original esp for the epilogue
    mov       ecx, [ecx + 16 + 28]  // w (read via saved esp; loop counter)

    align      4
    // Read in the data from the source pointer.
    // First round of bit swap: interleave row pairs at byte granularity.
  convertloop:
    movdqu    xmm0, [eax]             // row 0 (16 bytes = 8 UV pairs)
    movdqu    xmm1, [eax + edi]       // row 1
    lea       eax, [eax + 2 * edi]    // advance src two rows
    movdqa    xmm7, xmm0  // use xmm7 as temp register.
    punpcklbw xmm0, xmm1
    punpckhbw xmm7, xmm1
    movdqa    xmm1, xmm7              // xmm0/xmm1 = rows 0,1 interleaved lo/hi
    movdqu    xmm2, [eax]             // row 2
    movdqu    xmm3, [eax + edi]       // row 3
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm2
    punpcklbw xmm2, xmm3
    punpckhbw xmm7, xmm3
    movdqa    xmm3, xmm7              // xmm2/xmm3 = rows 2,3 interleaved lo/hi
    movdqu    xmm4, [eax]             // row 4
    movdqu    xmm5, [eax + edi]       // row 5
    lea       eax, [eax + 2 * edi]
    movdqa    xmm7, xmm4
    punpcklbw xmm4, xmm5
    punpckhbw xmm7, xmm5
    movdqa    xmm5, xmm7              // xmm4/xmm5 = rows 4,5 interleaved lo/hi
    movdqu    xmm6, [eax]             // row 6
    movdqu    xmm7, [eax + edi]       // row 7
    lea       eax, [eax + 2 * edi]
    movdqu    [esp], xmm5  // backup xmm5
    neg       edi                     // negate stride so the lea below rewinds
    movdqa    xmm5, xmm6  // use xmm5 as temp register.
    punpcklbw xmm6, xmm7
    punpckhbw xmm5, xmm7
    movdqa    xmm7, xmm5              // xmm6/xmm7 = rows 6,7 interleaved lo/hi
    // Rewind src 8 rows and advance 16 bytes to the next 8-UV-column tile.
    lea       eax, [eax + 8 * edi + 16]
    neg       edi                     // restore positive stride
    // Second round of bit swap: interleave at word granularity.
    movdqa    xmm5, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm5, xmm2
    movdqa    xmm2, xmm5
    movdqa    xmm5, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm5, xmm3
    movdqa    xmm3, xmm5
    movdqa    xmm5, xmm4
    punpcklwd xmm4, xmm6
    punpckhwd xmm5, xmm6
    movdqa    xmm6, xmm5
    // Swap which register is spilled: bring xmm5 back, park xmm6 instead.
    movdqu    xmm5, [esp]  // restore xmm5
    movdqu    [esp], xmm6  // backup xmm6
    movdqa    xmm6, xmm5  // use xmm6 as temp register.
    punpcklwd xmm5, xmm7
    punpckhwd xmm6, xmm7
    movdqa    xmm7, xmm6

    // Third round of bit swap: interleave at dword granularity.  Each
    // result register now holds a U output row in its low qword and the
    // matching V output row in its high qword.
    // Write to the destination pointer: movlpd = U row, movhpd = V row.
    movdqa    xmm6, xmm0
    punpckldq xmm0, xmm4
    punpckhdq xmm6, xmm4
    movdqa    xmm4, xmm6
    movdqu    xmm6, [esp]  // restore xmm6
    movlpd    qword ptr [edx], xmm0         // U row 0
    movhpd    qword ptr [ebx], xmm0         // V row 0
    movlpd    qword ptr [edx + esi], xmm4   // U row 1
    lea       edx, [edx + 2 * esi]          // advance dst_a two rows
    movhpd    qword ptr [ebx + ebp], xmm4   // V row 1
    lea       ebx, [ebx + 2 * ebp]          // advance dst_b two rows
    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
    punpckldq xmm2, xmm6
    movlpd    qword ptr [edx], xmm2         // U row 2
    movhpd    qword ptr [ebx], xmm2         // V row 2
    punpckhdq xmm0, xmm6
    movlpd    qword ptr [edx + esi], xmm0   // U row 3
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0   // V row 3
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
    punpckldq xmm1, xmm5
    movlpd    qword ptr [edx], xmm1         // U row 4
    movhpd    qword ptr [ebx], xmm1         // V row 4
    punpckhdq xmm0, xmm5
    movlpd    qword ptr [edx + esi], xmm0   // U row 5
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0   // V row 5
    lea       ebx, [ebx + 2 * ebp]
    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
    punpckldq xmm3, xmm7
    movlpd    qword ptr [edx], xmm3         // U row 6
    movhpd    qword ptr [ebx], xmm3         // V row 6
    punpckhdq xmm0, xmm7
    sub       ecx, 8                        // 8 UV columns consumed
    movlpd    qword ptr [edx + esi], xmm0   // U row 7
    lea       edx, [edx + 2 * esi]
    movhpd    qword ptr [ebx + ebp], xmm0   // V row 7
    lea       ebx, [ebx + 2 * ebp]
    jg        convertloop                   // loop while columns remain

    // Undo the stack realignment, restore callee-saved registers, return.
    mov       esp, [esp + 16]
    pop       ebp
    pop       edi
    pop       esi
    pop       ebx
    ret
  }
}
246
247#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
248
249#ifdef __cplusplus
250}  // extern "C"
251}  // namespace libyuv
252#endif
253