;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;  This file is a duplicate of mfqe_sse2.asm in VP8.
;  TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vp9_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
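;
; A scalar sketch of the blend below, inferred from the constants
; tMFQE = 16 and tMFQE_round = 8 (i.e. MFQE_PRECISION = 4); for each of
; the 256 pixels:
;
;     dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
;
; src_weight is assumed to be in [0, 16]; a weight of 16 copies src over
; dst unchanged.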
global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
sym(vp9_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight
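    ; xmm0 = src_weight and xmm1 = 16 - src_weight, each broadcast to all
    ; eight word lanes; together they form the two blend weights.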

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6                  ; zero register for unpacking bytes to words

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
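;
; Same blend as the 16x16 version above, applied to an 8x8 block: one
; 8-pixel row (a single quadword load) per loop iteration.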
global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
sym(vp9_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4                  ; zero register for unpacking and repacking

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4                  ; pack words back to bytes; high qword is zero
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad            5
;)
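;
; Results, as derived from the arithmetic below (both are rounded
; averages over the 256 pixels of the 16x16 block):
;
;     *sad      = (SAD(src1, src2) + 128) >> 8
;     *variance = (SUM(src2^2) - SUM(src2)^2 / 256 + 128) >> 8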
global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
sym(vp9_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)          ; src1
    mov         rcx,        arg(1)          ; stride1
    mov         rdx,        arg(2)          ; src2
    mov         rdi,        arg(3)          ; stride2

    mov         rsi,        16              ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate:
    movdqa      xmm0, [rax]                 ; src1
    movdqa      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. Instead,
    ; it expects one signed and one unsigned value, so we zero-extend the
    ; bytes and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1
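    ; xmm5 now accumulates SUM(src2^2) as four double word partial sums,
    ; combined into a single total after the loop.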

    sub         rsi,        1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words (and is SSSE3, not
    ; SSE2), so combine the two psadbw partial sums with a shift and add.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax,  arg(5)
    movd        [rax], xmm0
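    ; The stored SAD is the rounded per-pixel average absolute difference
    ; over the 256 pixels.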

    ; Combine the two psadbw partial sums into the low dword: SUM(src2)
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square the sum and divide by the 256 pixels: SUM(src2)^2 / 256.
    ; The square fits in 32 bits (65280^2 < 2^32), so the high double
    ; word of the pmuludq result can be ignored.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddd could be used to sum adjacent double word pairs, but we want
    ; all four values summed. Promote the double words to quad words,
    ; accumulate, shift and sum.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

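    ; SUM(src2^2) - SUM(src2)^2 / 256 leaves 256 times the variance
    ; (the identity Var = E[x^2] - E[x]^2, scaled by the pixel count).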
    psubd       xmm1, xmm0

    ; (256 * variance + 128) >> 8, i.e. the rounded per-pixel variance
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)

    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
