highbd_sad4d_sse2.asm revision 7ce0a1d1337c01056ba24006efab21f00e179e04
1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%include "third_party/x86inc/x86inc.asm"
12
13SECTION .text
14
; HIGH_SAD_4x2_REF acc, ref, off_first_ref, off_second_ref, first
;
; Helper for HIGH_PROCESS_4x2x4. Computes the sum of absolute differences
; between the two 4-sample source rows packed in m0 (row 0 in the low half,
; row 1 in the high half) and the matching two rows of ONE reference block.
;   acc             register that holds this reference's 4 dword partial sums
;   ref             pointer register of the reference block
;   off_first_ref   offset of the first row, in 16-bit samples
;   off_second_ref  offset of the second row, in 16-bit samples
;   first           1: overwrite acc with the result, otherwise add to acc
; Expects m1 to hold 1 in every 16-bit word (set up by the caller).
; Clobbers m3, and additionally m2 when first != 1.
%macro HIGH_SAD_4x2_REF 5
%if %5 == 1
  ; Only 4 samples (8 bytes) per row are needed, so use 8-byte loads; a
  ; 16-byte load here would read 8 bytes past the end of the row.
  movh                  %1, [%2+(%3)*2]
  movhps                %1, [%2+(%4)*2]
  mova                  m3, m0
  psubusw               m3, %1              ; max(src - ref, 0)
  psubusw               %1, m0              ; max(ref - src, 0)
  por                   %1, m3              ; one side is 0 => |src - ref|
  pmaddwd               %1, m1              ; add word pairs -> 4 dwords
%else
  movh                  m2, [%2+(%3)*2]
  movhps                m2, [%2+(%4)*2]
  mova                  m3, m0
  psubusw               m3, m2              ; max(src - ref, 0)
  psubusw               m2, m0              ; max(ref - src, 0)
  por                   m2, m3              ; one side is 0 => |src - ref|
  pmaddwd               m2, m1              ; add word pairs -> 4 dwords
  paddd                 %1, m2
%endif
%endmacro

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Processes two rows of a 4-sample-wide block against four reference blocks.
;   %1  first            1 on the first call (initialises m4-m7), else 0
;   %2  off_first_src    offset of source row 0, in 16-bit samples
;   %3  off_first_ref    offset of reference row 0, in 16-bit samples
;   %4  off_second_src   offset of source row 1, in 16-bit samples
;   %5  off_second_ref   offset of reference row 1, in 16-bit samples
;   %6  advance_at_end   1: step srcq/ref1q-ref4q forward by two rows
;                        (optional, defaults to 0)
; Results accumulate as 4 dwords each in m4 (ref1), m5 (ref2), m6 (ref3) and
; m7 (ref4). m0, m2 and m3 are scratch; m1 must hold 1 in every word.
; Offsets are parenthesised before scaling so that expression arguments such
; as "x + 8" are scaled as a whole.
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh                  m0, [srcq+(%2)*2]   ; source row 0 -> low half
  movhps                m0, [srcq+(%4)*2]   ; source row 1 -> high half
  HIGH_SAD_4x2_REF      m4, ref1q, %3, %5, %1
  HIGH_SAD_4x2_REF      m5, ref2q, %3, %5, %1
  HIGH_SAD_4x2_REF      m6, ref3q, %3, %5, %1
  HIGH_SAD_4x2_REF      m7, ref4q, %3, %5, %1
%if %6 == 1
  ; two rows * 2 bytes per sample = stride * 4 bytes
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
94
; HIGH_SAD_8_REF acc, ref_mem, first
;
; Helper for HIGH_PROCESS_8x2x4. Computes the sum of absolute differences
; between the 8 source samples in m0 and the 8 reference samples at ref_mem.
;   acc      register that holds this reference's 4 dword partial sums
;   ref_mem  memory operand addressing the 8 reference samples (unaligned ok)
;   first    1: overwrite acc with the result, otherwise add to acc
; Expects m1 to hold 1 in every 16-bit word (set up by the caller).
; Clobbers m3, and additionally m2 when first != 1.
%macro HIGH_SAD_8_REF 3
%if %3 == 1
  movu                  %1, %2
  mova                  m3, m0
  psubusw               m3, %1              ; max(src - ref, 0)
  psubusw               %1, m0              ; max(ref - src, 0)
  por                   %1, m3              ; one side is 0 => |src - ref|
  pmaddwd               %1, m1              ; add word pairs -> 4 dwords
%else
  movu                  m2, %2
  mova                  m3, m0
  psubusw               m3, m2              ; max(src - ref, 0)
  psubusw               m2, m0              ; max(ref - src, 0)
  por                   m2, m3              ; one side is 0 => |src - ref|
  pmaddwd               m2, m1              ; add word pairs -> 4 dwords
  paddd                 %1, m2
%endif
%endmacro

; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Processes two groups of 8 samples against four reference blocks. Called
; directly, the two groups are two rows of an 8-wide block; the wider
; HIGH_PROCESS_* macros also use it for the two 8-sample halves of one row.
;   %1  first            1 on the first call (initialises m4-m7), else 0
;   %2  off_first_src    offset of the first source group, in 16-bit samples
;   %3  off_first_ref    offset of the first reference group
;   %4  off_second_src   offset of the second source group
;   %5  off_second_ref   offset of the second reference group
;   %6  advance_at_end   1: step srcq/ref1q-ref4q forward by two rows
;                        (optional, defaults to 0)
; Results accumulate as 4 dwords each in m4 (ref1), m5 (ref2), m6 (ref3) and
; m7 (ref4). m0, m2 and m3 are scratch; m1 must hold 1 in every word.
; The source is read with mova, i.e. both source addresses are expected to be
; suitably aligned for an aligned vector load; references use movu.
; All offsets are parenthesised before scaling so that expression arguments
; such as "x + 8" are scaled as a whole.
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova                  m0, [srcq+(%2)*2]
  HIGH_SAD_8_REF        m4, [ref1q+(%3)*2], %1
  HIGH_SAD_8_REF        m5, [ref2q+(%3)*2], %1
  HIGH_SAD_8_REF        m6, [ref3q+(%3)*2], %1
  HIGH_SAD_8_REF        m7, [ref4q+(%3)*2], %1

  ; 2nd 8 px (always accumulates: m4-m7 are valid after the 1st group)
  mova                  m0, [srcq+(%4)*2]
  HIGH_SAD_8_REF        m4, [ref1q+(%5)*2], 0
  HIGH_SAD_8_REF        m5, [ref2q+(%5)*2], 0
  HIGH_SAD_8_REF        m6, [ref3q+(%5)*2], 0
  HIGH_SAD_8_REF        m7, [ref4q+(%5)*2], 0
%if %6 == 1
  ; two rows * 2 bytes per sample = stride * 4 bytes
  lea                 srcq, [srcq +src_strideq*4]
  lea                ref1q, [ref1q+ref_strideq*4]
  lea                ref2q, [ref2q+ref_strideq*4]
  lea                ref3q, [ref3q+ref_strideq*4]
  lea                ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro
193
; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two groups of 16 samples, each split into a left and a right 8-sample half
; and handed to HIGH_PROCESS_8x2x4. Only the first half of the first group
; may initialise the accumulators (first == 1); the pointer advance, if
; requested, is performed once, by the final call.
%macro HIGH_PROCESS_16x2x4 5-6 0
  ; first group: samples [off, off+8) and [off+8, off+16), never advances
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8), 0
  ; second group: always accumulates, forwards advance_at_end
  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro
199
; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two groups of 32 samples, each split into two 16-sample halves and handed
; to HIGH_PROCESS_16x2x4. Only the very first half may initialise the
; accumulators (first == 1); the pointer advance, if requested, is performed
; once, by the final call.
%macro HIGH_PROCESS_32x2x4 5-6 0
  ; first group: samples [off, off+16) and [off+16, off+32), never advances
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16), 0
  ; second group: always accumulates, forwards advance_at_end
  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro
205
; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
;
; Two groups of 64 samples, each split into two 32-sample halves and handed
; to HIGH_PROCESS_32x2x4. Only the very first half may initialise the
; accumulators (first == 1); the pointer advance, if requested, is performed
; once, by the final call.
%macro HIGH_PROCESS_64x2x4 5-6 0
  ; first group: samples [off, off+32) and [off+32, off+64), never advances
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32), 0
  ; second group: always accumulates, forwards advance_at_end
  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro
211
; void vpx_highbd_sadWxHx4d_sse2(uint8_t *src,    int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                uint32_t res[4]);
;
; HIGH_SADNXN4D W, H emits one such function. It computes the sum of absolute
; differences between a WxH block of 16-bit samples at src and each of the
; four reference blocks ref[0..3], and stores the four totals to res[0..3].
;   W  must have a matching HIGH_PROCESS_<W>x2x4 macro (4, 8, 16, 32 or 64)
;   H  must be even and at least 4: rows are consumed two at a time, with one
;      leading and one trailing pair outside the %rep loop
; The block sizes actually generated are listed by the HIGH_SADNXN4D
; invocations at the end of this file.
; Register use inside the body: m1 = 1 in every word, m4-m7 = per-reference
; dword accumulators, m0/m2/m3 = scratch.
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif

  ; m1 = 0x0001 in every 16-bit word. pmaddwd against it adds neighbouring
  ; words into dwords. Built in-register (all ones, then logical shift right
  ; by 15) so no general-purpose register or stack slot is needed.
  pcmpeqw               m1, m1
  psrlw                 m1, 15

  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided

  ; ref1q arrives holding the ref[] array; fetch the four block pointers.
  ; ref1q itself is overwritten last because it is the base of the loads.
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  ; convert byte pointers to short pointers
  ; NOTE(review): presumably the caller passes sample addresses halved
  ; (libvpx CONVERT_TO_BYTEPTR convention) - confirm against the C side.
  shl                 srcq, 1
  shl                ref1q, 1
  shl                ref2q, 1
  shl                ref3q, 1
  shl                ref4q, 1

  ; Rows 0-1 initialise the accumulators, the middle rows accumulate, and the
  ; last pair skips the pointer advance because nothing follows it.
  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
  ;
  ; Horizontal reduction. With m4 = [a0 a1 a2 a3] (likewise b, c, d in m5-m7):
  ; step 1: fold the high qword onto the low one -> low half = [x0+x2 x1+x3]
  movhlps               m0, m4
  movhlps               m1, m5
  movhlps               m2, m6
  movhlps               m3, m7
  paddd                 m4, m0
  paddd                 m5, m1
  paddd                 m6, m2
  paddd                 m7, m3
  ; step 2: interleave pairs -> m4 = [a02 b02 a13 b13], m6 = [c02 d02 c13 d13]
  punpckldq             m4, m5
  punpckldq             m6, m7
  ; step 3: fold again -> low half of m4 = [A B], low half of m6 = [C D]
  movhlps               m0, m4
  movhlps               m1, m6
  paddd                 m4, m0
  paddd                 m6, m1
  ; step 4: m4 = [A B C D], the four final SADs in ref[0..3] order
  punpcklqdq            m4, m6
  ; Load the res pointer (5th argument) into r4 unless it is already there;
  ; where res was not loaded by cglobal, r4 is ref2q, which is dead by now.
  movifnidn             r4, r4mp
  movu                [r4], m4
  RET
%endmacro
272
273
; Generate the SSE2 functions, one per supported block size (width, height).
INIT_XMM sse2

; 64 samples wide
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32

; 32 samples wide
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16

; 16 samples wide
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16, 8

; 8 samples wide
HIGH_SADNXN4D 8, 16
HIGH_SADNXN4D 8, 8
HIGH_SADNXN4D 8, 4

; 4 samples wide
HIGH_SADNXN4D 4, 8
HIGH_SADNXN4D 4, 4
287HIGH_SADNXN4D  4,  4
288