;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
10233d2500723e5594f3e7c70896ffeeef32b9c950ywan
11233d2500723e5594f3e7c70896ffeeef32b9c950ywan%include "third_party/x86inc/x86inc.asm"
12233d2500723e5594f3e7c70896ffeeef32b9c950ywan
13233d2500723e5594f3e7c70896ffeeef32b9c950ywanSECTION .text
14233d2500723e5594f3e7c70896ffeeef32b9c950ywan
; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
19233d2500723e5594f3e7c70896ffeeef32b9c950ywan
;-----------------------------------------------------------------------
; subtract_block: per-pixel residual of two 8-bit blocks.
;
;   diff[r][c] = (int16_t)src[r][c] - (int16_t)pred[r][c]
;
; Syntax: NASM + x86inc.asm abstraction layer (register names such as
;         rowsd/srcq/m0 are generated by cglobal / INIT_XMM).
; In:     rows, cols          block height / width in pixels
;         diff, diff_stride   int16_t output; stride is in elements, so
;                             it is scaled by 2 wherever it is used
;         src,  src_stride    uint8_t source; stride in bytes
;         pred, pred_stride   uint8_t prediction; stride in bytes
; Regs:   m0-m5 scratch, m7 = all-zero (used to widen bytes to words)
;
; Width dispatch: cols == 4, 8, 16 or 32 selects a dedicated loop; any
; other value falls through to the 64-wide loop.
;
; NOTE(review): the 16/32/64-wide paths use aligned loads/stores (mova)
; on src, pred and diff, so they presumably rely on the caller providing
; 16-byte aligned rows -- confirm against the callers.
; NOTE(review): every loop is a do-while (body runs at least once) and
; the 4/8/16-wide loops consume two rows per iteration, so rows is
; presumably always positive and even for those widths -- confirm.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
; Only 7 GPRs are requested, so the 8th argument (pred_stride) is not
; loaded by the prologue.  cols is needed solely for the dispatch below;
; once a path has been chosen its register is recycled for pred_stride.
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32

; loop16: subtract two independent runs of 16 pixels.
;   arg 1 / arg 2 : offsets from srcq  of the first / second run
;   arg 3 / arg 4 : offsets from predq of the first / second run
;   arg 5 / arg 6 : byte offsets from diffq of the first / second output
; Each 16-byte input is split into low and high halves, zero-extended to
; words against m7, and written as two consecutive 16-byte stores.
; Clobbers m0-m5.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7
  punpckhbw             m3, m1, m7
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  ; Fall-through path: 64 pixels per row, one row per iteration.
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

  ; 32 pixels per row, one row per iteration.
.case_32:
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

  ; 16 pixels per row; the two runs handled by loop16 are two
  ; consecutive rows, so each iteration advances by two rows.
.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtract two consecutive rows of 8 pixels.  movh loads 8 bytes
; per row; the widened result fills a whole register and is stored with
; mova.  Clobbers m0-m3.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

  ; 8 pixels per row, two rows per iteration.
.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

  ; 4 pixels per row, two rows per iteration.
  ;
  ; This path deliberately stays in XMM mode.  Switching to INIT_MMX here
  ; would make m7 refer to mm7, which the pxor at function entry never
  ; zeroed, and would require an emms before returning.  With XMM
  ; registers the memory footprint is the same as the MMX form: movd
  ; reads exactly 4 source bytes and movh writes exactly 8 bytes (four
  ; int16_t) per row.
.case_4:
  mov             pred_str, pred_stridemp
.loop_4:
  movd                  m0, [srcq]
  movd                  m2, [srcq+src_strideq]
  movd                  m1, [predq]
  movd                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  movh             [diffq], m0
  movh [diffq+diff_strideq*2], m2
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET
128