;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_sad16x16_ppc
    .globl vp8_sad16x8_ppc
    .globl vp8_sad8x16_ppc
    .globl vp8_sad8x8_ppc
    .globl vp8_sad4x4_ppc

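;# load_aligned_16: load 16 bytes from the possibly unaligned address in \R
;# into \V by fetching the two aligned quadwords that straddle it (\O holds
;# the offset to the second one, 16 here) and merging them with vperm using
;# the shift vector produced by lvsl.  Clobbers v1-v3.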
.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute vector for alignment

    lvx     v1,  0, \R
    lvx     v2, \O, \R

    vperm   \V, v1, v2, v3
.endm

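;# prologue/epilogue: save VRSAVE and flag v0-v9 as live, carve out a
;# 32-byte quadword scratch area on the stack, set r10 to the constant 16
;# and clear the v8 accumulator; the epilogue pops the stack frame and
;# restores the caller's VRSAVE.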
.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# offset to the second aligned quadword

    vspltisw v8, 0              ;# zero out total to start
.endm

.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
.endm

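;# SAD_16: accumulate the sum of absolute differences between the 16 bytes
;# of v4 and v5 into v8.  Saturating subtracts in both directions OR'd
;# together give |v4 - v5| per byte; vsum4ubs folds those bytes into the
;# four word partial sums kept in v8.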
.macro SAD_16
    ;# v6 = abs (v4 - v5)
    vsububs v6, v4, v5
    vsububs v7, v5, v4
    vor     v6, v6, v7

    ;# v8 += abs (v4 - v5)
    vsum4ubs v8, v6, v8
.endm

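;# sad_16_loop: SAD over a 16-byte-wide block, two rows per iteration, with
;# the loads for the next rows overlapped with the abs-diff/accumulate work
;# for the current ones.  The source rows are fetched with a bare lvx and
;# the lvsl permute vector for the reference pointer is computed only once,
;# which assumes r3 is 16-byte aligned and that both strides preserve the
;# pointers' low four bits from row to row (e.g. multiples of 16).  The
;# caller loads CTR with half the number of rows.  The four word partials
;# in v8 are reduced with vsumsws, stored to the stack, and the final word
;# is reloaded into r3 as the return value.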
.macro sad_16_loop loop_label
    lvsl    v3,  0, r5          ;# only needs to be done once per block

    ;# preload a line of data before getting into the loop
    lvx     v4, 0, r3
    lvx     v1,  0, r5
    lvx     v2, r10, r5

    add     r5, r5, r6
    add     r3, r3, r4

    vperm   v5, v1, v2, v3

    .align 4
\loop_label:
    ;# compute difference on first row
    vsububs v6, v4, v5
    vsububs v7, v5, v4

    ;# load up next set of data
    lvx     v9, 0, r3
    lvx     v1,  0, r5
    lvx     v2, r10, r5

    ;# perform abs() of difference
    vor     v6, v6, v7
    add     r3, r3, r4

    ;# add to the running tally
    vsum4ubs v8, v6, v8

    ;# now onto the next line
    vperm   v5, v1, v2, v3
    add     r5, r5, r6
    lvx     v4, 0, r3

    ;# compute difference on second row
    vsububs v6, v9, v5
    lvx     v1,  0, r5
    vsububs v7, v5, v9
    lvx     v2, r10, r5
    vor     v6, v6, v7
    add     r3, r3, r4
    vsum4ubs v8, v6, v8
    vperm   v5, v1, v2, v3
    add     r5, r5, r6

    bdnz    \loop_label

    vspltisw v7, 0

    vsumsws v8, v8, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)
.endm

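;# sad_8_loop: SAD over an 8-byte-wide block, two rows per iteration.  Each
;# row is fetched with load_aligned_16, so neither pointer has to be
;# aligned; vmrghb interleaves the first eight bytes of two src rows (and
;# likewise two ref rows) into one 16-byte vector so SAD_16 can be reused,
;# and the final reduction matches sad_16_loop.  The caller loads CTR with
;# half the number of rows.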
.macro sad_8_loop loop_label
    .align 4
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v7, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v7

    SAD_16

    bdnz    \loop_label

    vspltisw v7, 0

    vsumsws v8, v8, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)
.endm

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad16x16_ppc:

    prologue

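    ;# 16 rows, two per loop iteration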
    li      r9, 8
    mtctr   r9

    sad_16_loop sad16x16_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad16x8_ppc:

    prologue

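    ;# 8 rows, two per loop iteration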
    li      r9, 4
    mtctr   r9

    sad_16_loop sad16x8_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad8x16_ppc:

    prologue

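    ;# 16 rows, two per loop iteration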
    li      r9, 8
    mtctr   r9

    sad_8_loop sad8x16_loop

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad8x8_ppc:

    prologue

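    ;# 8 rows, two per loop iteration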
    li      r9, 4
    mtctr   r9

    sad_8_loop sad8x8_loop

    epilogue

    blr

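;# transfer_4x4: copy a 4x4 block of bytes (four 32-bit rows) from \I with
;# stride \P into the quadword scratch area on the stack so the whole block
;# can be picked up with a single lvx.  Clobbers r0 and r7-r9.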
.macro transfer_4x4 I P
    lwz     r0, 0(\I)
    add     \I, \I, \P

    lwz     r7, 0(\I)
    add     \I, \I, \P

    lwz     r8, 0(\I)
    add     \I, \I, \P

    lwz     r9, 0(\I)

    stw     r0,  0(r1)
    stw     r7,  4(r1)
    stw     r8,  8(r1)
    stw     r9, 12(r1)
.endm

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  src_stride
;# r5 unsigned char *ref_ptr
;# r6 int  ref_stride
;#
;# r3 return value
vp8_sad4x4_ppc:

    prologue

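    ;# gather the 4x4 src and ref blocks into v4 and v5 through the stack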
    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    transfer_4x4 r5, r6
    lvx     v5, 0, r1

    vspltisw v8, 0              ;# zero out total to start

    ;# v6 = abs (v4 - v5)
    vsububs v6, v4, v5
    vsububs v7, v5, v4
    vor     v6, v6, v7

    ;# reduce the absolute differences to a single sum (v8 is zero)
    vsum4ubs v7, v6, v8
    vsumsws v7, v7, v8

    stvx    v7, 0, r1
    lwz     r3, 12(r1)

    epilogue

    blr
