;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)

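; Computes diff = src - pred over a rows x cols block, widening the 8-bit
; pixels to 16-bit differences. Execution dispatches on the block width:
; widths 4, 8, 16 and 32 each get a dedicated loop, and any other width
; (i.e. 64) falls through to the 64-wide loop.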
INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
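; pred_stride is the 8th argument, beyond the 7 that cglobal loads into
; registers. colsq is dead once the width dispatch has run, so it is
; recycled to hold pred_stride, reloaded from its stack slot
; (pred_stridemp) at the top of each case.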
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32

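; loop16 %1-%2: src byte offsets, %3-%4: pred byte offsets,
;        %5-%6: diff byte offsets
; Subtracts two 16-pixel runs per invocation: each run loads 16 src and
; 16 pred bytes, unpacks both against the zero register m7 into 16-bit
; lanes, subtracts, and stores the resulting 16 words (two mmsize stores)
; at its diff offset. The diff offsets are in bytes, not int16_t elements.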
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7
  punpckhbw             m3, m1, m7
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

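; Fall-through case: cols == 64. Two loop16 invocations cover one full row.
; diff advances by diff_strideq*2 bytes because diff_stride counts int16_t
; elements.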
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

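; cols == 32: one loop16 invocation per row.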
.case_32:
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

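; cols == 16: the row strides are passed as the macro's second offsets, so
; a single loop16 invocation processes two rows at a time.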
.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

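; loop_h: subtracts one narrow run from each of two rows. movh loads half a
; register (8 bytes under INIT_XMM, 4 under INIT_MMX), and the full-register
; mova store writes the matching number of 16-bit differences per row.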
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

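; cols == 8: loop_h handles two rows per iteration with XMM registers.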
.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

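; cols == 4: switch to MMX registers so the same loop_h expansion loads
; four pixels (movh becomes movd) and stores four differences per row.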
INIT_MMX
.case_4:
  pxor                  m7, m7         ; after INIT_MMX, m7 is mm7, which
                                       ; aliases the x87 stack and is not
                                       ; cleared by the pxor on xmm7 above,
                                       ; so re-zero the zero register here
  mov             pred_str, pred_stridemp
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET