/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
11f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include "libyuv/rotate_row.h"
12cead1e07666bcc5914f8927712c2f89b9b789f9bFrank Barchard#include "libyuv/row.h"
13f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
14f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
15f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangnamespace libyuv {
16f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangextern "C" {
17f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
18f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// This module is for 32-bit x86 builds with Visual C and clang-cl.
20f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
21f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
22b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard__declspec(naked) void TransposeWx8_SSSE3(const uint8* src,
23b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                          int src_stride,
24b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                          uint8* dst,
25b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                          int dst_stride,
26b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                          int width) {
27f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __asm {
28f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    push      edi
29f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    push      esi
30f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    push      ebp
31b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    mov       eax, [esp + 12 + 4]  // src
32b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    mov       edi, [esp + 12 + 8]  // src_stride
33f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       edx, [esp + 12 + 12]  // dst
34f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       esi, [esp + 12 + 16]  // dst_stride
35f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       ecx, [esp + 12 + 20]  // width
36f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
37f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Read in the data from the source pointer.
38f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // First round of bit swap.
39f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    align      4
40f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang convertloop:
41f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      xmm0, qword ptr [eax]
42f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       ebp, [eax + 8]
43f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      xmm1, qword ptr [eax + edi]
44f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       eax, [eax + 2 * edi]
45f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklbw xmm0, xmm1
46f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      xmm2, qword ptr [eax]
47f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm1, xmm0
48f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm1, xmm1, 8
49f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      xmm3, qword ptr [eax + edi]
50f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       eax, [eax + 2 * edi]
51f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklbw xmm2, xmm3
52f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm3, xmm2
53f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      xmm4, qword ptr [eax]
54f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm3, xmm3, 8
55f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      xmm5, qword ptr [eax + edi]
56f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklbw xmm4, xmm5
57f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       eax, [eax + 2 * edi]
58f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm5, xmm4
59f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      xmm6, qword ptr [eax]
60f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm5, xmm5, 8
61f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      xmm7, qword ptr [eax + edi]
62f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklbw xmm6, xmm7
63f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       eax, ebp
64f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm7, xmm6
65f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm7, xmm7, 8
66f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Second round of bit swap.
67f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklwd xmm0, xmm2
68f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklwd xmm1, xmm3
69f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm2, xmm0
70f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm3, xmm1
71f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm2, xmm2, 8
72f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm3, xmm3, 8
73f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklwd xmm4, xmm6
74f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklwd xmm5, xmm7
75f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm6, xmm4
76f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm7, xmm5
77f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm6, xmm6, 8
78f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm7, xmm7, 8
79f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Third round of bit swap.
80f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Write to the destination pointer.
81f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckldq xmm0, xmm4
82f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      qword ptr [edx], xmm0
83f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm4, xmm0
84f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm4, xmm4, 8
85f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      qword ptr [edx + esi], xmm4
86f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       edx, [edx + 2 * esi]
87f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckldq xmm2, xmm6
88f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm6, xmm2
89f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm6, xmm6, 8
90f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      qword ptr [edx], xmm2
91f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckldq xmm1, xmm5
92f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      qword ptr [edx + esi], xmm6
93f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       edx, [edx + 2 * esi]
94f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm5, xmm1
95f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      qword ptr [edx], xmm1
96f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm5, xmm5, 8
97f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckldq xmm3, xmm7
98f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      qword ptr [edx + esi], xmm5
99f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       edx, [edx + 2 * esi]
100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      qword ptr [edx], xmm3
101f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm7, xmm3
102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    palignr   xmm7, xmm7, 8
103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    sub       ecx, 8
104f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movq      qword ptr [edx + esi], xmm7
105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       edx, [edx + 2 * esi]
106f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    jg        convertloop
107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
108f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    pop       ebp
109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    pop       esi
110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    pop       edi
111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    ret
112f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
113f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
114f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
115b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard__declspec(naked) void TransposeUVWx8_SSE2(const uint8* src,
116b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                           int src_stride,
117b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                           uint8* dst_a,
118b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                           int dst_stride_a,
119b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                           uint8* dst_b,
120b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                           int dst_stride_b,
121b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                           int w) {
122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  __asm {
123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    push      ebx
124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    push      esi
125f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    push      edi
126f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    push      ebp
127b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    mov       eax, [esp + 16 + 4]  // src
128b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    mov       edi, [esp + 16 + 8]  // src_stride
129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       edx, [esp + 16 + 12]  // dst_a
130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       esi, [esp + 16 + 16]  // dst_stride_a
131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       ebx, [esp + 16 + 20]  // dst_b
132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       ebp, [esp + 16 + 24]  // dst_stride_b
133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       ecx, esp
134f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    sub       esp, 4 + 16
135f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    and       esp, ~15
136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       [esp + 16], ecx
137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       ecx, [ecx + 16 + 28]  // w
138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    align      4
140b33a82ffd059f362574ae038458e8dee26ac5a4aFrank Barchard    // Read in the data from the source pointer.
141b33a82ffd059f362574ae038458e8dee26ac5a4aFrank Barchard    // First round of bit swap.
142b33a82ffd059f362574ae038458e8dee26ac5a4aFrank Barchard  convertloop:
143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm0, [eax]
144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm1, [eax + edi]
145f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       eax, [eax + 2 * edi]
146f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm7, xmm0  // use xmm7 as temp register.
147f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklbw xmm0, xmm1
148f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhbw xmm7, xmm1
149f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm1, xmm7
150f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm2, [eax]
151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm3, [eax + edi]
152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       eax, [eax + 2 * edi]
153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm7, xmm2
154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklbw xmm2, xmm3
155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhbw xmm7, xmm3
156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm3, xmm7
157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm4, [eax]
158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm5, [eax + edi]
159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       eax, [eax + 2 * edi]
160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm7, xmm4
161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklbw xmm4, xmm5
162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhbw xmm7, xmm5
163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm5, xmm7
164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm6, [eax]
165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm7, [eax + edi]
166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       eax, [eax + 2 * edi]
167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    [esp], xmm5  // backup xmm5
168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    neg       edi
169b33a82ffd059f362574ae038458e8dee26ac5a4aFrank Barchard    movdqa    xmm5, xmm6  // use xmm5 as temp register.
170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklbw xmm6, xmm7
171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhbw xmm5, xmm7
172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm7, xmm5
173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       eax, [eax + 8 * edi + 16]
174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    neg       edi
175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Second round of bit swap.
176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm5, xmm0
177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklwd xmm0, xmm2
178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhwd xmm5, xmm2
179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm2, xmm5
180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm5, xmm1
181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklwd xmm1, xmm3
182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhwd xmm5, xmm3
183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm3, xmm5
184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm5, xmm4
185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklwd xmm4, xmm6
186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhwd xmm5, xmm6
187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm6, xmm5
188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm5, [esp]  // restore xmm5
189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    [esp], xmm6  // backup xmm6
190b33a82ffd059f362574ae038458e8dee26ac5a4aFrank Barchard    movdqa    xmm6, xmm5  // use xmm6 as temp register.
191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpcklwd xmm5, xmm7
192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhwd xmm6, xmm7
193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm7, xmm6
194b33a82ffd059f362574ae038458e8dee26ac5a4aFrank Barchard
195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Third round of bit swap.
196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Write to the destination pointer.
197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm6, xmm0
198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckldq xmm0, xmm4
199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhdq xmm6, xmm4
200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqa    xmm4, xmm6
201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movdqu    xmm6, [esp]  // restore xmm6
202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movlpd    qword ptr [edx], xmm0
203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movhpd    qword ptr [ebx], xmm0
204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movlpd    qword ptr [edx + esi], xmm4
205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       edx, [edx + 2 * esi]
206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movhpd    qword ptr [ebx + ebp], xmm4
207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       ebx, [ebx + 2 * ebp]
208b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    movdqa    xmm0, xmm2  // use xmm0 as the temp register.
209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckldq xmm2, xmm6
210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movlpd    qword ptr [edx], xmm2
211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movhpd    qword ptr [ebx], xmm2
212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhdq xmm0, xmm6
213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movlpd    qword ptr [edx + esi], xmm0
214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       edx, [edx + 2 * esi]
215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movhpd    qword ptr [ebx + ebp], xmm0
216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       ebx, [ebx + 2 * ebp]
217b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    movdqa    xmm0, xmm1  // use xmm0 as the temp register.
218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckldq xmm1, xmm5
219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movlpd    qword ptr [edx], xmm1
220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movhpd    qword ptr [ebx], xmm1
221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhdq xmm0, xmm5
222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movlpd    qword ptr [edx + esi], xmm0
223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       edx, [edx + 2 * esi]
224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movhpd    qword ptr [ebx + ebp], xmm0
225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       ebx, [ebx + 2 * ebp]
226b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    movdqa    xmm0, xmm3  // use xmm0 as the temp register.
227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckldq xmm3, xmm7
228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movlpd    qword ptr [edx], xmm3
229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movhpd    qword ptr [ebx], xmm3
230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    punpckhdq xmm0, xmm7
231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    sub       ecx, 8
232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movlpd    qword ptr [edx + esi], xmm0
233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       edx, [edx + 2 * esi]
234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    movhpd    qword ptr [ebx + ebp], xmm0
235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    lea       ebx, [ebx + 2 * ebp]
236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    jg        convertloop
237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    mov       esp, [esp + 16]
239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    pop       ebp
240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    pop       edi
241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    pop       esi
242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    pop       ebx
243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    ret
244f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  }
245f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
246f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
247f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
248f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
249f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
250f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}  // extern "C"
251f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}  // namespace libyuv
252f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
253