;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    .globl sixtap_predict_ppc
    .globl sixtap_predict8x4_ppc
    .globl sixtap_predict8x8_ppc
    .globl sixtap_predict16x16_ppc

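;# load_c: load the 16-byte constant at LABEL+OFF into vector V,
;#  using R0/R1 as scratch registers to form the absolute address.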
.macro load_c V, LABEL, OFF, R0, R1
    lis     \R0, \LABEL@ha
    la      \R1, \LABEL@l(\R0)
    lvx     \V, \OFF, \R1
.endm

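;# Each HFilter row is 32 bytes: taps 0-3 (replicated four times) in the
;#  first vector and taps 4-5 in the second, so the horizontal filter
;#  always loads a pair of vectors.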
.macro load_hfilter V0, V1
    load_c \V0, HFilter, r5, r9, r10

    addi    r5,  r5, 16
    lvx     \V1, r5, r10
.endm

;# Vertical filtering
.macro Vprolog
    load_c v0, VFilter, r6, r3, r10

    vspltish v5, 8
    vspltish v6, 3
    vslh    v6, v5, v6      ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v1, v0, 1
    vspltb  v2, v0, 2
    vspltb  v3, v0, 3
    vspltb  v4, v0, 4
    vspltb  v5, v0, 5
    vspltb  v0, v0, 0
.endm

.macro vpre_load
    Vprolog
    li      r10,  16
    lvx     v10,   0, r9    ;# v10..v14 = first 5 rows
    lvx     v11, r10, r9
    addi    r9,   r9, 32
    lvx     v12,   0, r9
    lvx     v13, r10, r9
    addi    r9,   r9, 32
    lvx     v14,   0, r9
.endm

.macro Msum Re, Ro, V, T, TMP
                                ;# (Re,Ro) += (V*T)
    vmuleub \TMP, \V, \T        ;# clobbers \TMP
    vadduhm \Re, \Re, \TMP      ;# Re = evens, saturation unnecessary
    vmuloub \TMP, \V, \T
    vadduhm \Ro, \Ro, \TMP      ;# Ro = odds
.endm

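;# vinterp_no_store applies the 6-tap vertical filter to 16 pels at once.
;#  Roughly equivalent C for a single pel (a sketch only; note that the
;#  VFilter table below stores taps 1 and 4 as positive magnitudes, so
;#  they are subtracted here):
;#    t = 64 + t0*p0 + t2*p2 + t3*p3 + t5*p5 - (t1*p1 + t4*p4);
;#    out = clamp(t >> 7, 0, 255);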
.macro vinterp_no_store P0 P1 P2 P3 P4 P5
    vmuleub  v8, \P0, v0        ;# 64 + 4 positive taps
    vadduhm v16, v6, v8
    vmuloub  v8, \P0, v0
    vadduhm v17, v6, v8
    Msum v16, v17, \P2, v2, v8
    Msum v16, v17, \P3, v3, v8
    Msum v16, v17, \P5, v5, v8

    vmuleub v18, \P1, v1        ;# 2 negative taps
    vmuloub v19, \P1, v1
    Msum v18, v19, \P4, v4, v8

    vsubuhs v16, v16, v18       ;# subtract neg from pos
    vsubuhs v17, v17, v19
    vsrh    v16, v16, v7        ;# divide by 128
    vsrh    v17, v17, v7        ;# v16 v17 = evens, odds
    vmrghh  v18, v16, v17       ;# v18 v19 = 16-bit result in order
    vmrglh  v19, v16, v17
    vpkuhus  \P0, v18, v19      ;# P0 = 8-bit result
.endm

.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
    vmuleub v24, \P0, v13       ;# 64 + 4 positive taps
    vadduhm v21, v20, v24
    vmuloub v24, \P0, v13
    vadduhm v22, v20, v24
    Msum v21, v22, \P2, v15, v25
    Msum v21, v22, \P3, v16, v25
    Msum v21, v22, \P5, v18, v25

    vmuleub v23, \P1, v14       ;# 2 negative taps
    vmuloub v24, \P1, v14
    Msum v23, v24, \P4, v17, v25

    vsubuhs v21, v21, v23       ;# subtract neg from pos
    vsubuhs v22, v22, v24
    vsrh    v21, v21, v19       ;# divide by 128
    vsrh    v22, v22, v19       ;# v21 v22 = evens, odds
    vmrghh  v23, v21, v22       ;# v23 v24 = 16-bit result in order
    vmrglh  v24, v21, v22
    vpkuhus \P0, v23, v24       ;# P0 = 8-bit result
.endm


.macro Vinterp P0 P1 P2 P3 P4 P5
    vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
    stvx    \P0, 0, r7
    add     r7, r7, r8      ;# 33 ops per 16 pels
.endm


.macro luma_v P0, P1, P2, P3, P4, P5
    addi    r9,   r9, 16        ;# P5 = newest input row
    lvx     \P5,   0, r9
    Vinterp \P0, \P1, \P2, \P3, \P4, \P5
.endm

.macro luma_vtwo
    luma_v v10, v11, v12, v13, v14, v15
    luma_v v11, v12, v13, v14, v15, v10
.endm

.macro luma_vfour
    luma_vtwo
    luma_v v12, v13, v14, v15, v10, v11
    luma_v v13, v14, v15, v10, v11, v12
.endm

.macro luma_vsix
    luma_vfour
    luma_v v14, v15, v10, v11, v12, v13
    luma_v v15, v10, v11, v12, v13, v14
.endm

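;# Interp4 computes four horizontal output pels at once: vmsummbm
;#  multiply-sums signed filter bytes against unsigned pels, so each
;#  32-bit word accumulates taps 0-3 (v13) and taps 4-5 (v14) on top of
;#  the rounding constant in v15.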
.macro Interp4 R I I4
    vmsummbm \R, v13, \I, v15
    vmsummbm \R, v14, \I4, \R
.endm

.macro Read8x8 VD, RS, RP, increment_counter
    lvsl    v21,  0, \RS        ;# permute control vector for alignment

    ;# the input is unaligned, so it can span two vectors;
    ;#  load both and permute the useful bytes into place.
    lvx     \VD,   0, \RS
    lvx     v20, r10, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, \VD, v20, v21
.endm

.macro interp_8x8 R
    vperm   v20, \R, \R, v16    ;# v20 = 0123 1234 2345 3456
    vperm   v21, \R, \R, v17    ;# v21 = 4567 5678 6789 789A
    Interp4 v20, v20,  v21      ;# v20 = result 0 1 2 3
    vperm   \R, \R, \R, v18     ;# R   = 89AB 9ABC ABCx BCxx
    Interp4 v21, v21, \R        ;# v21 = result 4 5 6 7

    vpkswus \R, v20, v21        ;#  R = 0 1 2 3 4 5 6 7
    vsrh    \R, \R, v19

    vpkuhus \R, \R, \R          ;# saturate and pack

.endm

.macro Read4x4 VD, RS, RP, increment_counter
    lvsl    v21,  0, \RS        ;# permute control vector for alignment

    ;# the input is unaligned, so permute the bytes into place.
    lvx     v20,   0, \RS

.if \increment_counter
    add     \RS, \RS, \RP
.endif

    vperm   \VD, v20, v20, v21
.endm
    .text

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch
sixtap_predict_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xff87
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    vertical_only_4x4

    ;# load up horizontal filter
    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_4x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null, we need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_4x4

vertical_only_4x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null, we need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0, r3, r4, 1
    Read8x8 v1, r3, r4, 1
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_4x4:
    load_c   v20, b_hilo_4x4, 0, r9, r10
    load_c   v21, b_hilo, 0, r9, r10

    ;# reposition input so that it can go through the
    ;# filtering phase with one pass.
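    ;# After the two vperm pairs, v0 holds rows 0-3 and v4 holds rows
    ;#  4-7 (four 4-pel rows per vector), so a single
    ;#  vinterp_no_store_8x8 filters all 16 output pels; v1..v3 and v5
    ;#  are the same data shifted by one row (4 bytes) via vsldoi.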
    vperm   v0, v0, v1, v20     ;# 0 1 x x
    vperm   v2, v2, v3, v20     ;# 2 3 x x
    vperm   v4, v4, v5, v20     ;# 4 5 x x
    vperm   v6, v6, v7, v20     ;# 6 7 x x

    vperm   v0, v0, v2, v21     ;# 0 1 2 3
    vperm   v4, v4, v6, v21     ;# 4 5 6 7

    vsldoi  v1, v0, v4, 4
    vsldoi  v2, v0, v4, 8
    vsldoi  v3, v0, v4, 12

    vsldoi  v5, v4, v8, 4

    load_c   v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5

    stvx    v0, 0, r1

    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 4(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 8(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    lwz     r0, 12(r1)
    stw     r0, 0(r7)

    b       exit_4x4

store_4x4:

    stvx    v2, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v3, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v4, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)
    add     r7, r7, r8

    stvx    v5, 0, r1
    lwz     r0, 0(r1)
    stw     r0, 0(r7)

exit_4x4:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

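;# w_8x8: spill vector \V to the stack, copy its low 8 bytes to the
;#  destination, then advance \D by the pitch \P (\R is a scratch GPR).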
.macro w_8x8 V, D, R, P
    stvx    \V, 0, r1
    lwz     \R, 0(r1)
    stw     \R, 0(r7)
    lwz     \R, 4(r1)
    stw     \R, 4(r7)
    add     \D, \D, \P
.endm

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

sixtap_predict8x4_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x4

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x4

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null, we need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0, r9, r4, 1
    Read8x8 v1, r9, r4, 0
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8

    b       second_pass_8x4

second_pass_pre_copy_8x4:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null, we need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0,  r3, r4, 1
    Read8x8 v1,  r3, r4, 1
    Read8x8 v2,  r3, r4, 1
    Read8x8 v3,  r3, r4, 1
    Read8x8 v4,  r3, r4, 1
    Read8x8 v5,  r3, r4, 1
    Read8x8 v6,  r3, r4, 1
    Read8x8 v7,  r3, r4, 1
    Read8x8 v8,  r3, r4, 1

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_8x4:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x4

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8

    b       exit_8x4

store_aligned_8x4:

    load_c v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7

    b       exit_8x4

store_8x4:
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x4

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8

    b       exit_8x4

store_aligned2_8x4:
    load_c v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7

exit_8x4:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Because the width that needs to be filtered will fit in a single AltiVec
;#  register there is no need to loop.  Everything can stay in registers.
sixtap_predict8x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xffc0
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    slwi.   r5, r5, 5           ;# index into horizontal filter array

    vspltish v19, 7

    ;# If there isn't any filtering to be done for the horizontal, then
    ;#  just skip to the second pass.
    beq-    second_pass_pre_copy_8x8

    load_hfilter v13, v14

    ;# rounding added in on the multiply
    vspltisw v16, 8
    vspltisw v15, 3
    vslw    v15, v16, v15       ;# 0x00000040000000400000004000000040

    ;# Load up permutation constants
    load_c v16, B_0123, 0, r9, r10
    load_c v17, B_4567, 0, r9, r10
    load_c v18, B_89AB, 0, r9, r10

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    addi    r9, r3, 0
    li      r10, 16
    Read8x8 v2, r3, r4, 1
    Read8x8 v3, r3, r4, 1
    Read8x8 v4, r3, r4, 1
    Read8x8 v5, r3, r4, 1
    Read8x8 v6, r3, r4, 1
    Read8x8 v7, r3, r4, 1
    Read8x8 v8, r3, r4, 1
    Read8x8 v9, r3, r4, 1

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# filter a line
    interp_8x8 v2
    interp_8x8 v3
    interp_8x8 v4
    interp_8x8 v5
    interp_8x8 v6
    interp_8x8 v7
    interp_8x8 v8
    interp_8x8 v9

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump to storing the data.  Otherwise
    ;#  load up and filter the additional 5 lines that are needed
    ;#  for the vertical filter.
    beq-    store_8x8

    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null, we need to back off by 2*pitch
    sub     r9, r9, r4
    sub     r9, r9, r4

    Read8x8 v0,  r9, r4, 1
    Read8x8 v1,  r9, r4, 0
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    interp_8x8 v0
    interp_8x8 v1
    interp_8x8 v10
    interp_8x8 v11
    interp_8x8 v12

    b       second_pass_8x8

second_pass_pre_copy_8x8:
    ;# only needed if there is a vertical filter present
    ;# if the second filter is not null, we need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4
    li      r10, 16

    Read8x8 v0,  r3, r4, 1
    Read8x8 v1,  r3, r4, 1
    Read8x8 v2,  r3, r4, 1
    Read8x8 v3,  r3, r4, 1
    Read8x8 v4,  r3, r4, 1
    Read8x8 v5,  r3, r4, 1
    Read8x8 v6,  r3, r4, 1
    Read8x8 v7,  r3, r4, 1
    Read8x8 v8,  r3, r4, 1
    Read8x8 v9,  r3, r4, 1
    Read8x8 v10, r3, r4, 1
    Read8x8 v11, r3, r4, 1
    Read8x8 v12, r3, r4, 0

    slwi    r6, r6, 4           ;# index into vertical filter array

second_pass_8x8:
    load_c v13, VFilter, r6, r9, r10

    vspltish v15, 8
    vspltish v20, 3
    vslh    v20, v15, v20       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    vspltb  v14, v13, 1
    vspltb  v15, v13, 2
    vspltb  v16, v13, 3
    vspltb  v17, v13, 4
    vspltb  v18, v13, 5
    vspltb  v13, v13, 0

    vinterp_no_store_8x8 v0, v1, v2, v3,  v4,  v5
    vinterp_no_store_8x8 v1, v2, v3, v4,  v5,  v6
    vinterp_no_store_8x8 v2, v3, v4, v5,  v6,  v7
    vinterp_no_store_8x8 v3, v4, v5, v6,  v7,  v8
    vinterp_no_store_8x8 v4, v5, v6, v7,  v8,  v9
    vinterp_no_store_8x8 v5, v6, v7, v8,  v9,  v10
    vinterp_no_store_8x8 v6, v7, v8, v9,  v10, v11
    vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12

    cmpi    cr0, r8, 8
    beq     cr0, store_aligned_8x8

    w_8x8   v0, r7, r0, r8
    w_8x8   v1, r7, r0, r8
    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8

    b       exit_8x8

store_aligned_8x8:

    load_c v10, b_hilo, 0, r9, r10

    vperm   v0, v0, v1, v10
    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10

    stvx    v0, 0, r7
    addi    r7, r7, 16
    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7

    b       exit_8x8

store_8x8:
    cmpi    cr0, r8, 8
    beq     cr0, store_aligned2_8x8

    w_8x8   v2, r7, r0, r8
    w_8x8   v3, r7, r0, r8
    w_8x8   v4, r7, r0, r8
    w_8x8   v5, r7, r0, r8
    w_8x8   v6, r7, r0, r8
    w_8x8   v7, r7, r0, r8
    w_8x8   v8, r7, r0, r8
    w_8x8   v9, r7, r0, r8

    b       exit_8x8

store_aligned2_8x8:
    load_c v10, b_hilo, 0, r9, r10

    vperm   v2, v2, v3, v10
    vperm   v4, v4, v5, v10
    vperm   v6, v6, v7, v10
    vperm   v8, v8, v9, v10

    stvx    v2, 0, r7
    addi    r7, r7, 16
    stvx    v4, 0, r7
    addi    r7, r7, 16
    stvx    v6, 0, r7
    addi    r7, r7, 16
    stvx    v8, 0, r7

exit_8x8:

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .align 2
;# r3 unsigned char * src
;# r4 int src_pitch
;# r5 int x_offset
;# r6 int y_offset
;# r7 unsigned char * dst
;# r8 int dst_pitch

;# Two pass filtering.  First pass is horizontal edges, second pass is vertical
;#  edges.  One of the filters can be null, but both won't be.  Needs to use a
;#  temporary buffer because the source buffer can't be modified and the buffer
;#  for the destination is not large enough to hold the temporary data.
sixtap_predict16x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff
    ori     r12, r12, 0xf000
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-416(r1)         ;# create space on the stack

    ;# Three possibilities
    ;#  1. First filter is null.  Don't use a temp buffer.
    ;#  2. Second filter is null.  Don't use a temp buffer.
    ;#  3. Neither is null, use temp buffer.

    ;# First Pass (horizontal edge)
    ;#  setup pointers for src
    ;#  if possibility (1) then set up the src pointer to be the original
    ;#  and jump to the second pass.  This depends on whether x_offset is 0.

    ;# load up horizontal filter
    slwi.   r5, r5, 5           ;# index into horizontal filter array

    load_hfilter v4, v5

    beq-    copy_horizontal_16x21

    ;# Back off input buffer by 2 bytes.  Need 2 before and 3 after
    addi    r3, r3, -2

    slwi.   r6, r6, 4           ;# index into vertical filter array

    ;# setup constants
    ;# v14 permutation value for alignment
    load_c v14, b_hperm, 0, r9, r10

    ;# These statements assume there won't be a second pass; if there is,
    ;#  the values are reset inside the bypass below.
    li      r0, 16              ;# prepare for no vertical filter

    ;# Change the output pointer and pitch to be the actual
    ;#  destination instead of a temporary buffer.
    addi    r9, r7, 0
    addi    r5, r8, 0

    ;# no vertical filter, so write the output from the first pass
    ;#  directly into the output buffer.
    beq-    no_vertical_filter_bypass

    ;# if the second filter is not null, we need to back off by 2*pitch
    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# setup counter for the number of lines that are going to be filtered
    li      r0, 21

    ;# use the stack as temporary storage
    la      r9, 48(r1)
    li      r5, 16

no_vertical_filter_bypass:

    mtctr   r0

    ;# rounding added in on the multiply
    vspltisw v10, 8
    vspltisw v12, 3
    vslw    v12, v10, v12       ;# 0x00000040000000400000004000000040

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v13, 7

    ;# index to the next set of vectors in the row.
    li      r10, 16
    li      r12, 32

horizontal_loop_16x16:

    lvsl    v15,  0, r3         ;# permute control vector for alignment

    ;# input to filter is 21 bytes wide, output is 16 bytes.
    ;#  input can span three vectors if not aligned correctly.
    lvx     v1,   0, r3
    lvx     v2, r10, r3
    lvx     v3, r12, r3

    vperm   v8, v1, v2, v15
    vperm   v9, v2, v3, v15     ;# v8 v9 = 21 input pixels left-justified

    vsldoi  v11, v8, v9, 4

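    ;# Each "set" below shifts the 21 left-justified input bytes by one
    ;#  more byte and computes four output pels spaced 4 apart, so the
    ;#  four sets together produce all 16 pels (in scrambled order,
    ;#  fixed up by the b_hperm permute before the store).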
    ;# set 0
    vmsummbm v6, v4, v8, v12    ;# taps times elements
    vmsummbm v0, v5, v11, v6

    ;# set 1
    vsldoi  v10, v8, v9, 1
    vsldoi  v11, v8, v9, 5

    vmsummbm v6, v4, v10, v12
    vmsummbm v1, v5, v11, v6

    ;# set 2
    vsldoi  v10, v8, v9, 2
    vsldoi  v11, v8, v9, 6

    vmsummbm v6, v4, v10, v12
    vmsummbm v2, v5, v11, v6

    ;# set 3
    vsldoi  v10, v8, v9, 3
    vsldoi  v11, v8, v9, 7

    vmsummbm v6, v4, v10, v12
    vmsummbm v3, v5, v11, v6

    vpkswus v0, v0, v1          ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v1, v2, v3          ;# v1 = 2 6 A E 3 7 B F

    vsrh    v0, v0, v13         ;# divide v0, v1 by 128
    vsrh    v1, v1, v13

    vpkuhus v0, v0, v1          ;# v0 = scrambled 8-bit result
    vperm   v0, v0, v0, v14     ;# v0 = correctly-ordered result

    stvx    v0,  0, r9
    add     r9, r9, r5

    add     r3, r3, r4

    bdnz    horizontal_loop_16x16

    ;# check again to see if the vertical filter needs to be done.
    cmpi    cr0, r6, 0
    beq     cr0, end_16x16

    ;# yes there is, so go to the second pass
    b       second_pass_16x16

copy_horizontal_16x21:
    li      r10, 21
    mtctr   r10

    li      r10, 16

    sub     r3, r3, r4
    sub     r3, r3, r4

    ;# this is done above if there is a horizontal filter;
    ;#  if not, it needs to be done down here.
    slwi    r6, r6, 4           ;# index into vertical filter array

    ;# always write to the stack when doing a horizontal copy
    la      r9, 48(r1)

copy_horizontal_loop_16x21:
    lvsl    v15,  0, r3         ;# permute control vector for alignment

    lvx     v1,   0, r3
    lvx     v2, r10, r3

    vperm   v8, v1, v2, v15

    stvx    v8,  0, r9
    addi    r9, r9, 16

    add     r3, r3, r4

    bdnz    copy_horizontal_loop_16x21

second_pass_16x16:

    ;# always read from the stack when doing a vertical filter
    la      r9, 48(r1)

    ;# downshift by 7 ( divide by 128 ) at the end
    vspltish v7, 7

    vpre_load

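    ;# 6 + 6 + 4 = 16 filtered output rows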
    luma_vsix
    luma_vsix
    luma_vfour

end_16x16:

    addi    r1, r1, 416         ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr

    .data

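;# HFilter: one 32-byte row per x_offset (0-7).  The first 16 bytes hold
;#  taps 0-3 for four pels; the second 16 bytes hold taps 4-5 (padded
;#  with zeros).  Signs are stored directly, since vmsummbm treats the
;#  filter operand as signed.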
    .align 4
HFilter:
    .byte     0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0
    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12,  0, -6,123, 12
    .byte    -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,  0
    .byte     2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36,  2,-11,108, 36
    .byte    -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0, -8,  1,  0,  0
    .byte     0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50,  0, -9, 93, 50
    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0
    .byte     3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77,  3,-16, 77, 77
    .byte   -16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0,-16,  3,  0,  0
    .byte     0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93,  0, -6, 50, 93
    .byte    -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0, -9,  0,  0,  0
    .byte     1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108,  1, -8, 36,108
    .byte   -11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0,-11,  2,  0,  0
    .byte     0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123,  0, -1, 12,123
    .byte    -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0, -6,  0,  0,  0

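;# VFilter: one 16-byte row per y_offset (0-7).  Bytes 0-5 hold the six
;#  taps as unsigned magnitudes; taps 1 and 4 are negative in the real
;#  filter and are subtracted in vinterp_no_store / vinterp_no_store_8x8.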
    .align 4
VFilter:
    .byte     0,  0,128,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0,  6,123, 12,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     2, 11,108, 36,  8,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0,  9, 93, 50,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     3, 16, 77, 77, 16,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0,  6, 50, 93,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     1,  8, 36,108, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
    .byte     0,  1, 12,123,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0

    .align 4
b_hperm:
    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

    .align 4
B_0123:
    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6

    .align 4
B_4567:
    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10

    .align 4
B_89AB:
    .byte     8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14

    .align 4
b_hilo:
    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23

    .align 4
b_hilo_4x4:
    .byte     0,  1,  2,  3, 16, 17, 18, 19,  0,  0,  0,  0,  0,  0,  0,  0
1014