;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl vp8_sub_pixel_variance4x4_ppc
    .globl vp8_sub_pixel_variance8x8_ppc
    .globl vp8_sub_pixel_variance8x16_ppc
    .globl vp8_sub_pixel_variance16x8_ppc
    .globl vp8_sub_pixel_variance16x16_ppc

.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

.macro load_vfilter V0, V1
    load_c \V0, vfilter_b, r6, r12, r10

    addi    r6,  r6, 16
    lvx     \V1, r6, r10
.endm

.macro HProlog jump_label
    ;# load up horizontal filter
    slwi.   r5, r5, 4           ;# index into horizontal filter array

    ;# index to the next set of vectors in the row.
    li      r10, 16

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq     \jump_label

    load_c v20, hfilter_b, r5, r12, r0

    ;# setup constants
    ;# v28 permutation value for reordering the hfilter_16 output
    load_c v28, b_hperm_b, 0, r12, r0

    ;# index to the next set of vectors in the row.
    li      r12, 32

    ;# rounding added in on the multiply
    vspltisw v21, 8
    vspltisw v18, 3
    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040

    slwi.   r6, r6, 5           ;# index into vertical filter array
.endm
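
;# For reference, the offset-to-taps mapping set up above, as a rough C sketch
;#  (illustrative only; the names below are descriptive, not from this file):
;#
;#    /* xoffset/yoffset are eighth-pel positions, 0..7 */
;#    const unsigned char *htaps = hfilter_b + xoffset * 16;  /* slwi r5, r5, 4 */
;#    const unsigned char *vtaps = vfilter_b + yoffset * 32;  /* slwi r6, r6, 5 */
;#    /* each tap pair is (128 - 16*offset, 16*offset); the taps sum to 128,
;#       so the downshift by 7 at the end of each pass undoes the scaling */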

;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#

.macro hfilter_8 V, hp, lp, increment_counter
    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 9 bytes wide, output is 8 bytes.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17

    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A

    vmsummbm v24, v20, v24, v18
    vmsummbm v25, v20, v25, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)

    vsrh    v24, v24, v19       ;# divide by 128

    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
.endm
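
;# Per output pixel, this first (horizontal) pass computes, roughly in C
;#  (illustrative sketch only; t0/t1 are the two taps loaded from hfilter_b):
;#
;#    for (i = 0; i < 8; i++)
;#        out[i] = (unsigned char)((src[i] * t0 + src[i + 1] * t1 + 64) >> 7);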

.macro vfilter_16 P0 P1
    vmuleub v22, \P0, v20       ;# even bytes * first tap
    vadduhm v22, v18, v22       ;# add rounding (64)
    vmuloub v23, \P0, v20       ;# odd bytes * first tap
    vadduhm v23, v18, v23

    vmuleub v24, \P1, v21       ;# second tap
    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
    vmuloub v25, \P1, v21
    vadduhm v23, v23, v25       ;# Ro = odds

    vsrh    v22, v22, v19       ;# divide by 128
    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
    vmrghh  \P0, v22, v23       ;# interleave back to pixel order
    vmrglh  v23, v22, v23
    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
.endm
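
;# The second (vertical) pass combines two adjacent rows the same way,
;#  roughly in C (illustrative sketch only; t0/t1 come from vfilter_b):
;#
;#    for (i = 0; i < 16; i++)
;#        row0[i] = (unsigned char)((row0[i] * t0 + row1[i] * t1 + 64) >> 7);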

.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
    ;# Compute sum first.  Unpack so a signed subtract
    ;#  can be used.  Only a halfword signed subtract
    ;#  is available.  Do high, then low.
    vmrghb  \t1, \z0, \src
    vmrghb  \t2, \z0, \ref
    vsubshs \t1, \t1, \t2
    vsum4shs \sum, \t1, \sum

    vmrglb  \t1, \z0, \src
    vmrglb  \t2, \z0, \ref
    vsubshs \t1, \t1, \t2
    vsum4shs \sum, \t1, \sum

    ;# Now compute sse.
    vsububs \t1, \src, \ref
    vsububs \t2, \ref, \src
    vor     \t1, \t1, \t2       ;# |src - ref|

    vmsumubm \sse, \t1, \t1, \sse
.endm
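
;# Rough C equivalent of one compute_sum_sse invocation over a 16-byte block
;#  (illustrative sketch only):
;#
;#    for (i = 0; i < 16; i++) {
;#        int d = src[i] - ref[i];
;#        sum += d;
;#        sse += d * d;
;#    }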

.macro variance_final sum, sse, z0, DS
    vsumsws \sum, \sum, \z0
    vsumsws \sse, \sse, \z0

    stvx    \sum, 0, r1
    lwz     r3, 12(r1)

    stvx    \sse, 0, r1
    lwz     r4, 12(r1)

    stw     r4, 0(r9)           ;# sse

    mullw   r3, r3, r3          ;# sum*sum
    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
.endm
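
;# DS is log2 of the block's pixel count (4 for 4x4, 6 for 8x8, 7 for 8x16
;#  and 16x8, 8 for 16x16), so the final reduction is, roughly in C
;#  (illustrative sketch only):
;#
;#    *sse = sse_acc;
;#    return sse_acc - ((sum_acc * sum_acc) >> DS);   /* returned in r3 */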

.macro compute_sum_sse_16 V, increment_counter
    load_and_align_16  v16, r7, r8, \increment_counter
    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
.endm

.macro load_and_align_16 V, R, P, increment_counter
    lvsl    v17,  0, \R         ;# permute value for alignment

    ;# load 16 bytes; the unaligned input can span two vectors.
    lvx     v21,   0, \R
    lvx     v22, r10, \R

.if \increment_counter
    add     \R, \R, \P
.endif

    vperm   \V, v21, v22, v17
.endm
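
;# This is the usual AltiVec unaligned-load idiom: lvx fetches the two
;#  aligned quadwords covering the address, and vperm with the lvsl mask
;#  shifts the wanted 16 bytes into place.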

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
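;#
;# Roughly the C prototype this implements (assumed from the argument list
;#  above; the authoritative declaration lives in the C headers):
;#  unsigned int vp8_sub_pixel_variance4x4_ppc(unsigned char *src_ptr,
;#      int src_pixels_per_line, int xoffset, int yoffset,
;#      unsigned char *dst_ptr, int dst_pixels_per_line, unsigned int *sse);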
vp8_sub_pixel_variance4x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_4x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r12, r0
    load_c v11, b_4567_b, 0, r12, r0

    hfilter_8 v0, v10, v11, 1
    hfilter_8 v1, v10, v11, 1
    hfilter_8 v2, v10, v11, 1
    hfilter_8 v3, v10, v11, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_4x4_b

    hfilter_8 v4, v10, v11, 0

    b   second_pass_4x4_b

second_pass_4x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0, r3, r4, 1
    load_and_align_16 v1, r3, r4, 1
    load_and_align_16 v2, r3, r4, 1
    load_and_align_16 v3, r3, r4, 1
    load_and_align_16 v4, r3, r4, 0

second_pass_4x4_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4

compute_sum_sse_4x4_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    load_and_align_16 v4, r7, r8, 1
    load_and_align_16 v5, r7, r8, 1
    load_and_align_16 v6, r7, r8, 1
    load_and_align_16 v7, r7, r8, 1

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3

    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7

    load_c v10, b_hilo_b, 0, r12, r0

    vperm   v0, v0, v1, v10
    vperm   v1, v2, v3, v10

    compute_sum_sse v0, v1, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 4

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff0
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x8_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r12, r0
    load_c v11, b_4567_b, 0, r12, r0

    hfilter_8 v0, v10, v11, 1
    hfilter_8 v1, v10, v11, 1
    hfilter_8 v2, v10, v11, 1
    hfilter_8 v3, v10, v11, 1
    hfilter_8 v4, v10, v11, 1
    hfilter_8 v5, v10, v11, 1
    hfilter_8 v6, v10, v11, 1
    hfilter_8 v7, v10, v11, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_8x8_b

    hfilter_8 v8, v10, v11, 0

    b   second_pass_8x8_b

second_pass_8x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0, r3, r4, 1
    load_and_align_16 v1, r3, r4, 1
    load_and_align_16 v2, r3, r4, 1
    load_and_align_16 v3, r3, r4, 1
    load_and_align_16 v4, r3, r4, 1
    load_and_align_16 v5, r3, r4, 1
    load_and_align_16 v6, r3, r4, 1
    load_and_align_16 v7, r3, r4, 1
    load_and_align_16 v8, r3, r4, 0

    beq     compute_sum_sse_8x8_b

second_pass_8x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0, v1
    vfilter_16 v1, v2
    vfilter_16 v2, v3
    vfilter_16 v3, v4
    vfilter_16 v4, v5
    vfilter_16 v5, v6
    vfilter_16 v6, v7
    vfilter_16 v7, v8

compute_sum_sse_8x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    vmrghb  v0, v0, v1
    vmrghb  v1, v2, v3
    vmrghb  v2, v4, v5
    vmrghb  v3, v6, v7

    load_and_align_16 v4,  r7, r8, 1
    load_and_align_16 v5,  r7, r8, 1
    load_and_align_16 v6,  r7, r8, 1
    load_and_align_16 v7,  r7, r8, 1
    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 0

    vmrghb  v4, v4,  v5
    vmrghb  v5, v6,  v7
    vmrghb  v6, v8,  v9
    vmrghb  v7, v10, v11

    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
    compute_sum_sse v3, v7, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 6

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance8x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfffc
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x16_pre_copy_b

    ;# Load up permutation constants
    load_c v29, b_0123_b, 0, r12, r0
    load_c v30, b_4567_b, 0, r12, r0

    hfilter_8 v0,  v29, v30, 1
    hfilter_8 v1,  v29, v30, 1
    hfilter_8 v2,  v29, v30, 1
    hfilter_8 v3,  v29, v30, 1
    hfilter_8 v4,  v29, v30, 1
    hfilter_8 v5,  v29, v30, 1
    hfilter_8 v6,  v29, v30, 1
    hfilter_8 v7,  v29, v30, 1
    hfilter_8 v8,  v29, v30, 1
    hfilter_8 v9,  v29, v30, 1
    hfilter_8 v10, v29, v30, 1
    hfilter_8 v11, v29, v30, 1
    hfilter_8 v12, v29, v30, 1
    hfilter_8 v13, v29, v30, 1
    hfilter_8 v14, v29, v30, 1
    hfilter_8 v15, v29, v30, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_8x16_b

    hfilter_8 v16, v29, v30, 0

    b   second_pass_8x16_b

second_pass_8x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16 v0,  r3, r4, 1
    load_and_align_16 v1,  r3, r4, 1
    load_and_align_16 v2,  r3, r4, 1
    load_and_align_16 v3,  r3, r4, 1
    load_and_align_16 v4,  r3, r4, 1
    load_and_align_16 v5,  r3, r4, 1
    load_and_align_16 v6,  r3, r4, 1
    load_and_align_16 v7,  r3, r4, 1
    load_and_align_16 v8,  r3, r4, 1
    load_and_align_16 v9,  r3, r4, 1
    load_and_align_16 v10, r3, r4, 1
    load_and_align_16 v11, r3, r4, 1
    load_and_align_16 v12, r3, r4, 1
    load_and_align_16 v13, r3, r4, 1
    load_and_align_16 v14, r3, r4, 1
    load_and_align_16 v15, r3, r4, 1
    load_and_align_16 v16, r3, r4, 0

    beq     compute_sum_sse_8x16_b

second_pass_8x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_8x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    vmrghb  v0, v0,  v1
    vmrghb  v1, v2,  v3
    vmrghb  v2, v4,  v5
    vmrghb  v3, v6,  v7
    vmrghb  v4, v8,  v9
    vmrghb  v5, v10, v11
    vmrghb  v6, v12, v13
    vmrghb  v7, v14, v15

    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 1
    load_and_align_16 v12, r7, r8, 1
    load_and_align_16 v13, r7, r8, 1
    load_and_align_16 v14, r7, r8, 1
    load_and_align_16 v15, r7, r8, 1

    vmrghb  v8,  v8,  v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
    compute_sum_sse v3, v11, v18, v19, v20, v21, v23

    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 1
    load_and_align_16 v12, r7, r8, 1
    load_and_align_16 v13, r7, r8, 1
    load_and_align_16 v14, r7, r8, 1
    load_and_align_16 v15, r7, r8, 0

    vmrghb  v8,  v8,  v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
    compute_sum_sse v7, v11, v18, v19, v20, v21, v23

    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
    blr

;# Filters a horizontal line
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#
.macro hfilter_16 V, increment_counter

    lvsl    v17,  0, r3         ;# permute value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3
    lvx     v23, r12, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17
    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified

    ;# set 0
    vmsummbm v24, v20, v21, v18 ;# taps times elements

    ;# set 1
    vsldoi  v23, v21, v22, 1
    vmsummbm v25, v20, v23, v18

    ;# set 2
    vsldoi  v23, v21, v22, 2
    vmsummbm v26, v20, v23, v18

    ;# set 3
    vsldoi  v23, v21, v22, 3
    vmsummbm v27, v20, v23, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F

    vsrh    v24, v24, v19       ;# divide v24, v25 by 128
    vsrh    v25, v25, v19

    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
.endm
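
;# Each vmsummbm above produces four of the sixteen outputs (pixel indices
;#  0,4,8,12 / 1,5,9,13 / 2,6,10,14 / 3,7,11,15), so the packed result is
;#  interleaved; the final vperm with b_hperm_b puts the pixels back into
;#  raster order.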

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance16x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_16x8_pre_copy_b

    hfilter_16 v0, 1
    hfilter_16 v1, 1
    hfilter_16 v2, 1
    hfilter_16 v3, 1
    hfilter_16 v4, 1
    hfilter_16 v5, 1
    hfilter_16 v6, 1
    hfilter_16 v7, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_16x8_b

    hfilter_16 v8, 0

    b   second_pass_16x8_b

second_pass_16x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0,  r3, r4, 1
    load_and_align_16  v1,  r3, r4, 1
    load_and_align_16  v2,  r3, r4, 1
    load_and_align_16  v3,  r3, r4, 1
    load_and_align_16  v4,  r3, r4, 1
    load_and_align_16  v5,  r3, r4, 1
    load_and_align_16  v6,  r3, r4, 1
    load_and_align_16  v7,  r3, r4, 1
    load_and_align_16  v8,  r3, r4, 1

    beq     compute_sum_sse_16x8_b

second_pass_16x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8

compute_sum_sse_16x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    compute_sum_sse_16 v0, 1
    compute_sum_sse_16 v1, 1
    compute_sum_sse_16 v2, 1
    compute_sum_sse_16 v3, 1
    compute_sum_sse_16 v4, 1
    compute_sum_sse_16 v5, 1
    compute_sum_sse_16 v6, 1
    compute_sum_sse_16 v7, 0

    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
vp8_sub_pixel_variance16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    HProlog second_pass_16x16_pre_copy_b

    hfilter_16 v0,  1
    hfilter_16 v1,  1
    hfilter_16 v2,  1
    hfilter_16 v3,  1
    hfilter_16 v4,  1
    hfilter_16 v5,  1
    hfilter_16 v6,  1
    hfilter_16 v7,  1
    hfilter_16 v8,  1
    hfilter_16 v9,  1
    hfilter_16 v10, 1
    hfilter_16 v11, 1
    hfilter_16 v12, 1
    hfilter_16 v13, 1
    hfilter_16 v14, 1
    hfilter_16 v15, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to computing the sum and sse.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.
    beq     compute_sum_sse_16x16_b

    hfilter_16 v16, 0

    b   second_pass_16x16_b

second_pass_16x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array

    load_and_align_16  v0,  r3, r4, 1
    load_and_align_16  v1,  r3, r4, 1
    load_and_align_16  v2,  r3, r4, 1
    load_and_align_16  v3,  r3, r4, 1
    load_and_align_16  v4,  r3, r4, 1
    load_and_align_16  v5,  r3, r4, 1
    load_and_align_16  v6,  r3, r4, 1
    load_and_align_16  v7,  r3, r4, 1
    load_and_align_16  v8,  r3, r4, 1
    load_and_align_16  v9,  r3, r4, 1
    load_and_align_16  v10, r3, r4, 1
    load_and_align_16  v11, r3, r4, 1
    load_and_align_16  v12, r3, r4, 1
    load_and_align_16  v13, r3, r4, 1
    load_and_align_16  v14, r3, r4, 1
    load_and_align_16  v15, r3, r4, 1
    load_and_align_16  v16, r3, r4, 0

    beq     compute_sum_sse_16x16_b

second_pass_16x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_16x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    compute_sum_sse_16 v0,  1
    compute_sum_sse_16 v1,  1
    compute_sum_sse_16 v2,  1
    compute_sum_sse_16 v3,  1
    compute_sum_sse_16 v4,  1
    compute_sum_sse_16 v5,  1
    compute_sum_sse_16 v6,  1
    compute_sum_sse_16 v7,  1
    compute_sum_sse_16 v8,  1
    compute_sum_sse_16 v9,  1
    compute_sum_sse_16 v10, 1
    compute_sum_sse_16 v11, 1
    compute_sum_sse_16 v12, 1
    compute_sum_sse_16 v13, 1
    compute_sum_sse_16 v14, 1
    compute_sum_sse_16 v15, 0

    variance_final v18, v19, v23, 8

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

    .align 4
hfilter_b:
    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0

    .align 4
vfilter_b:
    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112

    .align 4
b_hperm_b:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
b_0123_b:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
b_4567_b:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

    .align 4
b_hilo_b:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23