;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;  This file is a duplicate of mfqe_sse2.asm in VP8.
;  TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"

;void vp9_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
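; Each output pixel is a weighted blend of the corresponding src and dst
; pixels:
;   dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
; with 16 = 1 << MFQE_PRECISION and 8 the rounding term. A scalar C
; sketch of the same operation (illustration only, not the library's
; reference implementation):
;   for (r = 0; r < 16; r++) {
;     for (c = 0; c < 16; c++)
;       dst[c] = (src[c] * src_weight + dst[c] * (16 - src_weight) + 8) >> 4;
;     src += src_stride;
;     dst += dst_stride;
;   }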
global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
sym(vp9_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6                  ; zero register for unpacking bytes to words

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3                  ; pack words back to bytes
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp9_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
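; Same blend as the 16x16 version:
;   dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
; but processes one 8 pixel row per iteration, so only the low halves of
; the xmm registers carry data.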
global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
sym(vp9_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4                  ; zero register for unpacking bytes to words

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4                  ; pack words back to bytes
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp9_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad            5
;)
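; Computes two statistics over the 16x16 block, where a and b denote
; corresponding pixels of src1 and src2:
;   *sad      = (sum(|a - b|) + 128) >> 8
;   *variance = (sum(b^2) - sum(b)^2 / 256 + 128) >> 8
; Since sum(b^2) - sum(b)^2 / 256 is 256 times the population variance
; of src2, both outputs are rounded averages over the 256 pixels.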
global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
sym(vp9_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)          ; src1
    mov         rcx,        arg(1)          ; stride1
    mov         rdx,        arg(2)          ; src2
    mov         rdi,        arg(3)          ; stride2

    mov         rsi,        16              ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames we can't
    ; depend on any kind of data alignment, so use unaligned loads.
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values; instead,
    ; it expects one signed and one unsigned value. So zero-extend the
    ; bytes and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0                  ; square words, sum adjacent pairs
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi,        1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words, so combine the two
    ; qword halves of the SAD accumulator with a shift and add instead.
    ; Finalize SAD: average over the 256 pixels, with rounding, and store.
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8                     ; (SAD + 128) >> 8

    mov         rax,  arg(5)
    movd        [rax], xmm0

    ; Combine the two partial sums of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square the sum of src2. The square fits in the low double word,
    ; so the high half of the 64 bit product can be ignored.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8                     ; sum(src2)^2 / 256

    ; phaddd could be used to sum adjacent values, but we want all of
    ; the values summed. Widen the double words to quad words,
    ; accumulate, then add the two halves.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2                  ; sum(src2^2)

    psubd       xmm1, xmm0                  ; 256 * variance

    ; (256 * variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)

    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128: ; rounding term for the >> 8 in vp9_variance_and_sad_16x16_sse2
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08