1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
    ;# Exported bilinear (two-tap) sub-pixel prediction routines.
    .globl bilinear_predict4x4_ppc
    .globl bilinear_predict8x4_ppc
    .globl bilinear_predict8x8_ppc
    .globl bilinear_predict16x16_ppc
16
;# Load the 16-byte vector constant at LABEL + OFF into V.
;#  R0 and R1 are GPR scratch registers used to materialize the address;
;#  R1 is left holding the address of LABEL.
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha      ;# high-adjusted half of LABEL's address
    la      \R1, \LABEL@l(\R0)  ;# \R1 = full address of LABEL
    lvx     \V, \OFF, \R1       ;# \V = 16 bytes at LABEL + \OFF
.endm
22
;# Load the pair of vertical-filter tap vectors selected by r6
;#  (r6 = y_offset * 32, i.e. two 16-byte vectors per offset) into
;#  V0 and V1.  Clobbers r6 (advanced by 16), r9 and r10.
.macro load_vfilter V0, V1
    load_c \V0, vfilter_b, r6, r9, r10  ;# first tap, splatted per byte

    addi    r6,  r6, 16
    lvx     \V1, r6, r10                ;# second tap vector (next 16 bytes)
.endm
29
;# Common first-pass setup shared by all block sizes.
;#  Scales r5 (x_offset) and r6 (y_offset) into byte indexes for the
;#  filter tables, loads the horizontal taps and constants, and branches
;#  to \jump_label when x_offset == 0 (no horizontal filtering needed).
;#  On fall-through, cr0 is left set by "slwi. r6" so the caller can
;#  test for y_offset == 0 with a plain beq.
.macro HProlog jump_label
    ;# load up horizontal filter
    slwi.   r5, r5, 4           ;# index into horizontal filter array

    ;# index to the next set of vectors in the row.
    li      r10, 16
    li      r12, 32

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq     \jump_label

    load_c v20, hfilter_b, r5, r9, r0   ;# v20 = horizontal taps for x_offset

    ;# setup constants
    ;# v28 = permutation that unscrambles the packed filter output
    load_c v28, b_hperm_b, 0, r9, r0

    ;# rounding added in on the multiply
    vspltisw v21, 8
    vspltisw v18, 3
    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040

    slwi.   r6, r6, 5           ;# index into vertical filter array
.endm
58
59;# Filters a horizontal line
60;# expects:
61;#  r3  src_ptr
62;#  r4  pitch
63;#  r10 16
64;#  r12 32
;#  v17 perm input
66;#  v18 rounding
67;#  v19 shift
68;#  v20 filter taps
69;#  v21 tmp
70;#  v22 tmp
71;#  v23 tmp
72;#  v24 tmp
73;#  v25 tmp
74;#  v26 tmp
75;#  v27 tmp
76;#  v28 perm output
77;#
;# Core of the 8-wide horizontal filter.  Expects v21 to hold the
;#  left-justified input pixels; writes 8 filtered bytes (in the packed,
;#  scrambled order produced by vpkswus) to the low half of \V.
.macro HFilter V
    vperm   v24, v21, v21, v10  ;# v24 = 0123 1234 2345 3456
    vperm   v25, v21, v21, v11  ;# v25 = 4567 5678 6789 789A

    vmsummbm v24, v20, v24, v18 ;# taps times pixels, rounding folded in
    vmsummbm v25, v20, v25, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)

    vsrh    v24, v24, v19       ;# divide by 128

    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
.endm
91
;# Horizontally filter one 8-pixel row at r3 (any alignment) into \V.
;#  Advances r3 by the source pitch (r4) when \increment_counter != 0.
.macro hfilter_8 V, increment_counter
    lvsl    v17,  0, r3         ;# permutate value for alignment

    ;# input to filter is 9 bytes wide, output is 8 bytes.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17  ;# left-justify the input bytes

    HFilter \V
.endm
106
107
;# Unaligned load of one source row at r3 into \V (no filtering; used
;#  when x_offset == 0).  Advances r3 by the source pitch (r4) when
;#  \increment_counter != 0.
.macro load_and_align_8 V, increment_counter
    lvsl    v17,  0, r3         ;# permutate value for alignment

    ;# two aligned vector loads cover any unaligned 16-byte span;
    ;#  vperm left-justifies the bytes starting at r3.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif

    vperm   \V, v21, v22, v17
.endm
122
;# Store \V at the destination r7 (must be 16-byte aligned for stvx);
;#  advance r7 by the destination pitch (r8) when requested.
.macro write_aligned_8 V, increment_counter
    stvx    \V,  0, r7

.if \increment_counter
    add     r7, r7, r8
.endif
.endm
130
;# Vertical two-tap filter across a pair of rows, per byte:
;#  \P0 = (\P0 * v20 + \P1 * v21 + round) >> 7
;#  v18 holds the rounding constant (0x0040 per halfword), v19 the
;#  shift count (7).  Clobbers v22-v25.
.macro vfilter_16 P0 P1
    vmuleub v22, \P0, v20       ;# 64 + 4 positive taps
    vadduhm v22, v18, v22       ;# rounding onto even-byte products
    vmuloub v23, \P0, v20
    vadduhm v23, v18, v23       ;# rounding onto odd-byte products

    vmuleub v24, \P1, v21
    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
    vmuloub v25, \P1, v21
    vadduhm v23, v23, v25       ;# Ro = odds

    vsrh    v22, v22, v19       ;# divide by 128
    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
    vmrghh  \P0, v22, v23       ;# re-interleave to 16-bit results in order
    vmrglh  v23, v22, v23
    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
.endm
148
149
;# Write the low 8 bytes of \V to the destination via the 16-byte
;#  aligned stack scratch area at r1, then advance the destination.
;#  NOTE(review): the two stw instructions hard-code r7 as the base, so
;#  callers must pass \D = r7 (with \P = dest pitch); \R is GPR scratch.
.macro w_8x8 V, D, R, P
    stvx    \V, 0, r1           ;# spill vector to stack scratch
    lwz     \R, 0(r1)
    stw     \R, 0(r7)
    lwz     \R, 4(r1)
    stw     \R, 4(r7)
    add     \D, \D, \P          ;# dest += pitch
.endm
158
159
    .align 2
;# void bilinear_predict4x4_ppc(unsigned char *src, int src_pitch,
;#                              int x_offset,  int y_offset,
;#                              unsigned char *dst, int dst_pitch)
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict4x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830    ;# mark the vector regs this routine uses
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_4x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    ;# first pass: horizontal filter, one row per vector
    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.  (cr0 set by HProlog's "slwi. r6".)
    beq     store_out_4x4_b

    hfilter_8 v4, 0

    b   second_pass_4x4_b

second_pass_4x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    ;# x_offset == 0: just copy the 5 source rows, no horizontal filter
    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1

second_pass_4x4_b:
    ;# rebuild the rounding constant as halfwords for the vertical pass
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4

store_out_4x4_b:

    ;# store 4 bytes per row, staged through the stack scratch area
    stvx    v0, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v1, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v2, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v3, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)

exit_4x4:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr
244
    .align 2
;# void bilinear_predict8x4_ppc(unsigned char *src, int src_pitch,
;#                              int x_offset,  int y_offset,
;#                              unsigned char *dst, int dst_pitch)
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict8x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xf830    ;# mark the vector regs this routine uses
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x4_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    ;# first pass: horizontal filter, one row per vector
    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.  (cr0 set by HProlog's "slwi. r6".)
    beq     store_out_8x4_b

    hfilter_8 v4, 0

    b   second_pass_8x4_b

second_pass_8x4_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    ;# x_offset == 0: just copy the 5 source rows, no horizontal filter
    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1

second_pass_8x4_b:
    ;# rebuild the rounding constant as halfwords for the vertical pass
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4

store_out_8x4_b:

    cmpi    cr0, r8, 8          ;# dst_pitch == 8 allows full-vector stores
    beq     cr0, store_aligned_8x4_b

    ;# general case: 8 bytes per row via the stack scratch area
    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8

    b       exit_8x4

store_aligned_8x4_b:
    ;# pack pairs of 8-byte rows into whole vectors and store them
    load_c v10, b_hilo_b, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7

exit_8x4:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr
330
    .align 2
;# void bilinear_predict8x8_ppc(unsigned char *src, int src_pitch,
;#                              int x_offset,  int y_offset,
;#                              unsigned char *dst, int dst_pitch)
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xfff0    ;# mark the vector regs this routine uses
    ori     r12, r12, 0xffff
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    HProlog second_pass_8x8_pre_copy_b

    ;# Load up permutation constants
    load_c v10, b_0123_b, 0, r9, r12
    load_c v11, b_4567_b, 0, r9, r12

    ;# first pass: horizontal filter, one row per vector
    hfilter_8 v0, 1
    hfilter_8 v1, 1
    hfilter_8 v2, 1
    hfilter_8 v3, 1
    hfilter_8 v4, 1
    hfilter_8 v5, 1
    hfilter_8 v6, 1
    hfilter_8 v7, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.  (cr0 set by HProlog's "slwi. r6".)
    beq     store_out_8x8_b

    hfilter_8 v8, 0

    b   second_pass_8x8_b

second_pass_8x8_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    ;# x_offset == 0: just copy the 9 source rows, no horizontal filter
    load_and_align_8  v0, 1
    load_and_align_8  v1, 1
    load_and_align_8  v2, 1
    load_and_align_8  v3, 1
    load_and_align_8  v4, 1
    load_and_align_8  v5, 1
    load_and_align_8  v6, 1
    load_and_align_8  v7, 1
    load_and_align_8  v8, 0

second_pass_8x8_b:
    ;# rebuild the rounding constant as halfwords for the vertical pass
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8

store_out_8x8_b:

    cmpi    cr0, r8, 8          ;# dst_pitch == 8 allows full-vector stores
    beq     cr0, store_aligned_8x8_b

    ;# general case: 8 bytes per row via the stack scratch area
    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8

    b       exit_8x8

store_aligned_8x8_b:
    ;# pack pairs of 8-byte rows into whole vectors and store them
    load_c v10, b_hilo_b, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7

exit_8x8:

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE

    blr
438
439;# Filters a horizontal line
440;# expects:
441;#  r3  src_ptr
442;#  r4  pitch
443;#  r10 16
444;#  r12 32
;#  v17 perm input
446;#  v18 rounding
447;#  v19 shift
448;#  v20 filter taps
449;#  v21 tmp
450;#  v22 tmp
451;#  v23 tmp
452;#  v24 tmp
453;#  v25 tmp
454;#  v26 tmp
455;#  v27 tmp
456;#  v28 perm output
457;#
;# Horizontally filter one 16-pixel row at r3 (any alignment) into \V,
;#  using the register contract documented above.  Advances r3 by the
;#  source pitch (r4) when \increment_counter != 0.
.macro hfilter_16 V, increment_counter

    lvsl    v17,  0, r3         ;# permutate value for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v21,   0, r3
    lvx     v22, r10, r3
    lvx     v23, r12, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17
    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified

    ;# set 0
    vmsummbm v24, v20, v21, v18 ;# taps times elements

    ;# set 1
    vsldoi  v23, v21, v22, 1    ;# shift the window left by one pixel
    vmsummbm v25, v20, v23, v18

    ;# set 2
    vsldoi  v23, v21, v22, 2
    vmsummbm v26, v20, v23, v18

    ;# set 3
    vsldoi  v23, v21, v22, 3
    vmsummbm v27, v20, v23, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F

    vsrh    v24, v24, v19       ;# divide by 128
    vsrh    v25, v25, v19

    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
.endm
498
;# Unaligned load of one 16-byte source row at r3 into \V (no
;#  filtering; used when x_offset == 0).  Advances r3 by the source
;#  pitch (r4) when \increment_counter != 0.
.macro load_and_align_16 V, increment_counter
    lvsl    v17,  0, r3         ;# permutate value for alignment

    ;# two aligned vector loads cover any unaligned 16-byte span;
    ;#  vperm left-justifies the bytes starting at r3.
    lvx     v21,   0, r3
    lvx     v22, r10, r3

.if \increment_counter
    add     r3, r3, r4
.endif

    vperm   \V, v21, v22, v17
.endm
513
;# Store \V at the destination r7 (must be 16-byte aligned for stvx);
;#  advance r7 by the destination pitch (r8) when requested.
.macro write_16 V, increment_counter
    stvx    \V,  0, r7

.if \increment_counter
    add     r7, r7, r8
.endif
.endm
521
    .align 2
;# void bilinear_predict16x16_ppc(unsigned char *src, int src_pitch,
;#                                int x_offset,  int y_offset,
;#                                unsigned char *dst, int dst_pitch)
;# No stack frame is needed: rows are stored with stvx directly, so the
;#  destination is assumed 16-byte aligned (full-vector store path only).
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
bilinear_predict16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff    ;# mark the vector regs this routine uses
    ori     r12, r12, 0xfff8
    mtspr   256, r12            ;# set VRSAVE

    HProlog second_pass_16x16_pre_copy_b

    ;# first pass: horizontal filter, one row per vector
    hfilter_16 v0,  1
    hfilter_16 v1,  1
    hfilter_16 v2,  1
    hfilter_16 v3,  1
    hfilter_16 v4,  1
    hfilter_16 v5,  1
    hfilter_16 v6,  1
    hfilter_16 v7,  1
    hfilter_16 v8,  1
    hfilter_16 v9,  1
    hfilter_16 v10, 1
    hfilter_16 v11, 1
    hfilter_16 v12, 1
    hfilter_16 v13, 1
    hfilter_16 v14, 1
    hfilter_16 v15, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional line that is needed
    ;#  for the vertical filter.  (cr0 set by HProlog's "slwi. r6".)
    beq     store_out_16x16_b

    hfilter_16 v16, 0

    b   second_pass_16x16_b

second_pass_16x16_pre_copy_b:
    slwi    r6, r6, 5           ;# index into vertical filter array

    ;# x_offset == 0: just copy the 17 source rows, no horizontal filter
    load_and_align_16  v0,  1
    load_and_align_16  v1,  1
    load_and_align_16  v2,  1
    load_and_align_16  v3,  1
    load_and_align_16  v4,  1
    load_and_align_16  v5,  1
    load_and_align_16  v6,  1
    load_and_align_16  v7,  1
    load_and_align_16  v8,  1
    load_and_align_16  v9,  1
    load_and_align_16  v10, 1
    load_and_align_16  v11, 1
    load_and_align_16  v12, 1
    load_and_align_16  v13, 1
    load_and_align_16  v14, 1
    load_and_align_16  v15, 1
    load_and_align_16  v16, 0

second_pass_16x16_b:
    ;# rebuild the rounding constant as halfwords for the vertical pass
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

store_out_16x16_b:

    write_16 v0,  1
    write_16 v1,  1
    write_16 v2,  1
    write_16 v3,  1
    write_16 v4,  1
    write_16 v5,  1
    write_16 v6,  1
    write_16 v7,  1
    write_16 v8,  1
    write_16 v9,  1
    write_16 v10, 1
    write_16 v11, 1
    write_16 v12, 1
    write_16 v13, 1
    write_16 v14, 1
    write_16 v15, 0

    mtspr   256, r11            ;# reset old VRSAVE

    blr
631
    .data

    .align 4
;# Horizontal filter taps, one 16-byte vector per x_offset (0..7).
;#  Each 4-byte group is {tap0, tap1, 0, 0} for use with vmsummbm;
;#  the two taps always sum to 128, hence the final shift by 7.
hfilter_b:
    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0

    .align 4
;# Vertical filter taps, two 16-byte vectors per y_offset (0..7):
;#  first vector = tap0 splatted, second vector = tap1 splatted
;#  (taps sum to 128; indexed by y_offset * 32 via load_vfilter).
vfilter_b:
    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112

    .align 4
;# Permutation that unscrambles the 0 4 8 C / 1 5 9 D / 2 6 A E /
;#  3 7 B F byte order produced by the pack instructions.
b_hperm_b:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
;# Sliding 4-pixel windows starting at pixels 0..3 (8-wide filter).
b_0123_b:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
;# Sliding 4-pixel windows starting at pixels 4..7 (8-wide filter).
b_4567_b:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

;# Merge the low 8 bytes of two vectors into one (for aligned 8-wide
;#  stores).  Falls at a 16-byte boundary after the preceding table.
b_hilo_b:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
678