;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
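;# AltiVec (VMX) implementations of the VP8 variance, MSE and get-var
;# routines declared below.  Each variance routine returns
;# sse - ((sum * sum) >> log2(block_width * block_height)) in r3;
;# vp8_mse16x16_ppc returns the sse alone.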


    .globl vp8_get8x8var_ppc
    .globl vp8_get16x16var_ppc
    .globl vp8_mse16x16_ppc
    .globl vp8_variance16x16_ppc
    .globl vp8_variance16x8_ppc
    .globl vp8_variance8x16_ppc
    .globl vp8_variance8x8_ppc
    .globl vp8_variance4x4_ppc

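;# Load 16 bytes from the possibly unaligned address in \R into \V.
;# lvx can only fetch from 16-byte aligned addresses, so the two aligned
;# quadwords covering the data are loaded and vperm, driven by the lvsl
;# shift vector, picks out the 16 bytes that actually start at \R.
;# \O is expected to hold 16 (the offset of the second quadword).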
.macro load_aligned_16 V R O
    lvsl    v3,  0, \R          ;# permute value for alignment

    lvx     v1,  0, \R
    lvx     v2, \O, \R

    vperm   \V, v1, v2, v3
.endm

.macro prologue
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    li      r10, 16             ;# load offset and loop counter

    vspltisw v7, 0              ;# zero for merging
    vspltisw v8, 0              ;# zero out total to start
    vspltisw v9, 0              ;# zero out total for dif^2
.endm

.macro epilogue
    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE
.endm

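;# Accumulate the sum of differences and the sum of squared differences
;# for the 16 byte lanes in v4 (src) and v5 (ref).  Per lane, roughly:
;#   diff = src[i] - ref[i];
;#   sum += diff;              (accumulated in v8)
;#   sse += diff * diff;       (accumulated in v9)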
.macro compute_sum_sse
    ;# Compute sum first.  Unpack so that a signed subtract
    ;#  can be used.  Only a halfword signed subtract is
    ;#  available.  Do high, then low.
    vmrghb  v2, v7, v4
    vmrghb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    vmrglb  v2, v7, v4
    vmrglb  v3, v7, v5
    vsubshs v2, v2, v3
    vsum4shs v8, v2, v8

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9
.endm

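;# Per-row loop for 16-pixel-wide blocks.  \DS is log2 of the pixel
;# count and \store_sum selects whether the sum is written to *Sum.
;# A rough C sketch of the tail of the macro, using the pointer names
;# from the register comments further below:
;#   if (store_sum) *Sum = sum;
;#   *SSE = sse;
;#   return sse - ((sum * sum) >> DS);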
.macro variance_16 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm

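;# Same idea as variance_16, but for 8-pixel-wide blocks: two
;# consecutive rows are fetched and merged into one 16-byte vector with
;# vmrghb, so each iteration covers two rows and the loop count is half
;# the block height.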
.macro variance_8 DS loop_label store_sum
\loop_label:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v6, r3, r10
    load_aligned_16 v0, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    vmrghb  v4, v4, v6
    vmrghb  v5, v5, v0

    compute_sum_sse

    bdnz    \loop_label

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

.if \store_sum
    stw     r3, 0(r8)           ;# sum
.endif
    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
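;#
;# Assumed C prototype, reconstructed from the register comments above:
;#   unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride,
;#                                  unsigned char *ref_ptr, int recon_stride,
;#                                  unsigned int *SSE, int *Sum);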
vp8_get8x8var_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, get8x8var_loop, 1

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *SSE
;# r8 int *Sum
;#
;# r3 return value
vp8_get16x16var_ppc:

    prologue

    mtctr   r10

    variance_16 8, get16x16var_loop, 1

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
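;#
;# MSE only: the squared differences are accumulated, but no sum is
;# kept and nothing is subtracted, so roughly
;#   *sse = sum over all pixels of (src[i] - ref[i])^2;  return *sse;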
vp8_mse16x16_ppc:
    prologue

    mtctr   r10

mse16x16_loop:
    ;# only one of the inputs should need to be aligned.
    load_aligned_16 v4, r3, r10
    load_aligned_16 v5, r5, r10

    ;# move onto the next line
    add     r3, r3, r4
    add     r5, r5, r6

    ;# Now compute sse.
    vsububs v2, v4, v5
    vsububs v3, v5, v4
    vor     v2, v2, v3

    vmsumubm v9, v2, v2, v9

    bdnz    mse16x16_loop

    vsumsws v9, v9, v7

    stvx    v9, 0, r1
    lwz     r3, 12(r1)

    stw     r3, 0(r7)           ;# sse

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
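;#
;# The variance entry points below share this assumed prototype
;# (vp8_varianceWxH_ppc is a placeholder for the names that follow):
;#   unsigned int vp8_varianceWxH_ppc(unsigned char *src_ptr, int source_stride,
;#                                    unsigned char *ref_ptr, int recon_stride,
;#                                    unsigned int *sse);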
vp8_variance16x16_ppc:

    prologue

    mtctr   r10

    variance_16 8, variance16x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance16x8_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_16 7, variance16x8_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance8x16_ppc:

    prologue

    li      r9, 8
    mtctr   r9

    variance_8 7, variance8x16_loop, 0

    epilogue

    blr

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
vp8_variance8x8_ppc:

    prologue

    li      r9, 4
    mtctr   r9

    variance_8 6, variance8x8_loop, 0

    epilogue

    blr

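;# Copy a 4x4 block (four 4-byte rows at stride \P, starting at \I) into
;# the 16-byte scratch area at the top of the stack frame so that a
;# single lvx can load it as one vector.  Advances \I by three strides
;# and clobbers r0, r8, r9 and r10.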
.macro transfer_4x4 I P
    lwz     r0, 0(\I)
    add     \I, \I, \P

    lwz     r10,0(\I)
    add     \I, \I, \P

    lwz     r8, 0(\I)
    add     \I, \I, \P

    lwz     r9, 0(\I)

    stw     r0,  0(r1)
    stw     r10, 4(r1)
    stw     r8,  8(r1)
    stw     r9, 12(r1)
.endm

    .align 2
;# r3 unsigned char *src_ptr
;# r4 int  source_stride
;# r5 unsigned char *ref_ptr
;# r6 int  recon_stride
;# r7 unsigned int *sse
;#
;# r3 return value
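;#
;# The 4x4 source and reference blocks are each packed into a single
;# vector via transfer_4x4, so compute_sum_sse runs once and the final
;# shift is log2(4 * 4) = 4.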
vp8_variance4x4_ppc:

    prologue

    transfer_4x4 r3, r4
    lvx     v4, 0, r1

    transfer_4x4 r5, r6
    lvx     v5, 0, r1

    compute_sum_sse

    vsumsws v8, v8, v7
    vsumsws v9, v9, v7

    stvx    v8, 0, r1
    lwz     r3, 12(r1)

    stvx    v9, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r7)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srlwi   r3, r3, 4           ;# (sum*sum) >> 4
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)

    epilogue

    blr