190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
2f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
4f71323e297a928af368937089d3ed71239786f86Andreas Huber;  Use of this source code is governed by a BSD-style license
5f71323e297a928af368937089d3ed71239786f86Andreas Huber;  that can be found in the LICENSE file in the root of the source
6f71323e297a928af368937089d3ed71239786f86Andreas Huber;  tree. An additional intellectual property rights grant can be found
7f71323e297a928af368937089d3ed71239786f86Andreas Huber;  in the file PATENTS.  All contributing project authors may
8f71323e297a928af368937089d3ed71239786f86Andreas Huber;  be found in the AUTHORS file in the root of the source tree.
990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;
1090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
1190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# Entry points made visible to the linker.
1290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .globl vp8_sub_pixel_variance4x4_ppc
1390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .globl vp8_sub_pixel_variance8x8_ppc
1490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .globl vp8_sub_pixel_variance8x16_ppc
1590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .globl vp8_sub_pixel_variance16x8_ppc
1690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .globl vp8_sub_pixel_variance16x16_ppc
1790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# load_c V, LABEL, OFF, R0, R1
;#  Loads one vector into V from the data symbol LABEL, indexed by OFF.
;#   V      destination vector register
;#   LABEL  data symbol; its address is built from the @ha / @l halves
;#   OFF    index operand handed to lvx: a GPR, or literal 0 for no index
;#   R0     scratch GPR, receives the high-adjusted half of the address
;#   R1     scratch GPR, receives the full address of LABEL
;#  Both R0 and R1 are overwritten.
1890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro load_c V, LABEL, OFF, R0, R1
1990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lis     \R0, \LABEL@ha
2090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    la      \R1, \LABEL@l(\R0)
2190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lvx     \V, \OFF, \R1
2290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
2390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# load_vfilter V0, V1
;#  Loads two consecutive vectors from the vfilter_b table:
;#   V0 <- vfilter_b + r6
;#   V1 <- vfilter_b + r6 + 16
;#  expects:
;#   r6  byte offset into vfilter_b
;#  side effects:
;#   r6  is advanced by 16
;#   r10 is overwritten with the address of vfilter_b, so it no longer
;#       holds the constant 16 that the load macros rely on
;#   r12 is overwritten (scratch for load_c)
2490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro load_vfilter V0, V1
2590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_c \V0, vfilter_b, r6, r12, r10
2690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
2790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    addi    r6,  r6, 16
2890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lvx     \V1, r6, r10
2990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
3090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# HProlog jump_label
;#  Common set-up for the horizontal (first) pass.
;#  expects:
;#   r5  xoffset
;#   r6  yoffset
;#  always sets:
;#   r5  xoffset * 16 (byte offset into hfilter_b)
;#   r10 16
;#   v19 7 in every halfword (shift count)
;#  If xoffset is zero, control leaves through jump_label at that point.
;#  Nothing below the branch has run, so the target must scale r6 itself.
;#  otherwise also sets:
;#   v20 horizontal filter taps, v28 vector loaded from b_hperm_b
;#   v18 0x40 in every word (rounding), v21 scratch
;#   r12 32, r0 scratch
;#   r6  yoffset * 32 (byte offset into vfilter_b)
;#   CR0 result of that last shift; EQ set means yoffset is zero.
;#       Callers test it with a beq placed after their hfilter_8 rows.
3190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro HProlog jump_label
3290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# load up horizontal filter
3390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    slwi.   r5, r5, 4           ;# index into horizontal filter array, sets CR0
3490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# index to the next set of vectors in the row.
3690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    li      r10, 16
3790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
3890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# downshift by 7 ( divide by 128 ) at the end
3990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v19, 7
4090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# If there is no horizontal filtering to be done (xoffset is zero),
4290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  just skip to the second pass.
4390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq     \jump_label
4490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_c v20, hfilter_b, r5, r12, r0
4690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
4790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# setup constants
4890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# v28 permutation vector from b_hperm_b
4990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_c v28, b_hperm_b, 0, r12, r0
5090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# index to the vector after next in the row.
5290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    li      r12, 32
5390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# rounding added in on the multiply: 8 << 3 = 64 in every word
5590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltisw v21, 8
5690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltisw v18, 3
5790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
5890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
5990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    slwi.   r6, r6, 5           ;# index into vertical filter array, sets CR0
6090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
6190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
6290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# Filters a horizontal line
6390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# expects:
6490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  r3  src_ptr
6590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  r4  pitch
6690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  r10 16
6790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  r12 32
6890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v17 perm input
6990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v18 rounding
7090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v19 shift
7190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v20 filter taps
7290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v21 tmp
7390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v22 tmp
7490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v23 tmp
7590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v24 tmp
7690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v25 tmp
7790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v26 tmp
7890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v27 tmp
7990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#  v28 perm output
8090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#
;# Of the registers listed above, the body below references only r3, r4,
;#  r10, v17-v22, v24 and v25.
;# arguments:
;#  V                  destination vector for the filtered row
;#  hp, lp             permute-control vectors selecting the source bytes
;#                     for the first and second group of output pixels
;#  increment_counter  non-zero: advance r3 by r4 after loading the row
8190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro hfilter_8 V, hp, lp, increment_counter
8390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lvsl    v17,  0, r3         ;# permutate value for alignment
8490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# input to filter is 9 bytes wide, output is 8 bytes.
8690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lvx     v21,   0, r3
8790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lvx     v22, r10, r3
8890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
8990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.if \increment_counter
9090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add     r3, r3, r4
9190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endif
9290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vperm   v21, v21, v22, v17
9390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
9490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
9590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
9690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# multiply by the taps in v20, sum each group of four, add rounding v18
9790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmsummbm v24, v20, v24, v18
9890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmsummbm v25, v20, v25, v18
9990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
10190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsrh    v24, v24, v19       ;# shift right by v19 (7 => divide by 128)
10390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
10490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
10590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
10690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# vfilter_16 P0 P1
;#  Two-tap vertical filter over 16 bytes. For every byte lane:
;#   P0 = (P0 * v20 + P1 * v21 + v18) >> v19, saturated to 8 bits
;#  expects:
;#   P0  current row, P1 next row
;#   v18 rounding constant in every halfword
;#   v19 shift count in every halfword
;#   v20 taps applied to P0, v21 taps applied to P1
;#  result is written back to P0; v22-v25 are overwritten.
10790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro vfilter_16 P0 P1
10890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmuleub v22, \P0, v20       ;# even bytes of P0 times first tap
10990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vadduhm v22, v18, v22
11090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmuloub v23, \P0, v20
11190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vadduhm v23, v18, v23
11290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmuleub v24, \P1, v21
11490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
11590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmuloub v25, \P1, v21
11690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vadduhm v23, v23, v25       ;# Ro = odds
11790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
11890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsrh    v22, v22, v19       ;# divide by 128
11990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
12090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghh  \P0, v22, v23       ;# P0 v23 = 16-bit result in order
12190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrglh  v23, v22, v23
12290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
12390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
12490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# compute_sum_sse src, ref, sum, sse, t1, t2, z0
;#  Accumulates, over the 16 byte lanes of src and ref:
;#   sum += (src - ref)        signed, as per-word partial sums
;#   sse += (src - ref)^2      as per-word partial sums
;#  arguments:
;#   src, ref  vectors of unsigned bytes to compare
;#   sum, sse  accumulator vectors, updated in place
;#   t1, t2    scratch vectors, overwritten
;#   z0        vector that must hold zero (used to widen bytes to halfwords)
12590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
12690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# Compute sum first.  Unpack so that a signed subtract
12790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  can be used.  Only a halfword signed subtract is
12890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  available.  Do high, then low.
12990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  \t1, \z0, \src
13090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  \t2, \z0, \ref
13190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsubshs \t1, \t1, \t2
13290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsum4shs \sum, \t1, \sum
13390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrglb  \t1, \z0, \src
13590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrglb  \t2, \z0, \ref
13690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsubshs \t1, \t1, \t2
13790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsum4shs \sum, \t1, \sum
13890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
13990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# Now compute sse.  The two saturating subtracts give the absolute
    ;#  difference: one of them is zero in every lane, so OR merges them.
14090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsububs \t1, \src, \ref
14190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsububs \t2, \ref, \src
14290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vor     \t1, \t1, \t2
14390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
14490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmsumubm \sse, \t1, \t1, \sse
14590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
14690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# variance_final sum, sse, z0, DS
;#  Reduces the accumulator vectors to scalars and produces the result:
;#   store sse total to the word at 0(r9)
;#   r3 = sse - ((sum * sum) >> DS)
;#  arguments:
;#   sum, sse  accumulator vectors (overwritten by the reduction)
;#   z0        vector that must hold zero
;#   DS        shift count; the visible callers pass 4 for 4x4 and 6 for 8x8
;#  expects:
;#   r1  stack pointer, used as a bounce buffer to move vector data to GPRs
;#   r9  address receiving the sse total
;#  overwrites r3, r4 and the 16 bytes at r1.
;#  NOTE(review): the callers in view have just executed stwu r1,-32(r1),
;#   so the stvx below overwrites the back-chain word stored at 0(r1).
;#   They restore r1 with addi and never reload it, but the chain is not
;#   walkable in between - confirm this is acceptable.
;#  NOTE(review): reading the total at offset 12 presumes r1 is 16-byte
;#   aligned - confirm against the platform ABI.
14790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro variance_final sum, sse, z0, DS
14890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsumsws \sum, \sum, \z0
14990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vsumsws \sse, \sse, \z0
15090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# the reduced total sits in the last word of the vector
15190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    stvx    \sum, 0, r1
15290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lwz     r3, 12(r1)
15390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    stvx    \sse, 0, r1
15590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lwz     r4, 12(r1)
15690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    stw     r4, 0(r9)           ;# sse
15890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
15990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mullw   r3, r3, r3          ;# sum*sum
1601b362b15af34006e6a11974088a46d42b903418eJohann    srlwi   r3, r3, \DS         ;# (sum*sum) >> DS
16190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
16290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
16390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# compute_sum_sse_16 V, increment_counter
;#  Loads 16 reference bytes from r7 into v16 and accumulates V against
;#  them: sum in v18, sse in v19.
;#  expects:
;#   r7  reference pointer, r8 its pitch, r10 16
;#   v23 zero
;#  increment_counter non-zero advances r7 by r8 after the load.
;#  overwrites v16, v17, v20, v21, v22.
;#  Not invoked anywhere in this section of the file.
16490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro compute_sum_sse_16 V, increment_counter
16590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v16, r7, r8, \increment_counter
16690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
16790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
16890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# load_and_align_16 V, R, P, increment_counter
;#  Loads 16 bytes starting at the (possibly unaligned) address in R into V.
;#  arguments:
;#   V                  destination vector
;#   R                  GPR holding the source address
;#   P                  GPR holding the pitch added to R when requested
;#   increment_counter  non-zero: R += P after the loads
;#  expects:
;#   r10 16
;#  overwrites v17, v21, v22.
16990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.macro load_and_align_16 V, R, P, increment_counter
17090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lvsl    v17,  0, \R         ;# permutate value for alignment
17190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# the 16 wanted bytes can straddle two aligned vectors, so both
17390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  are always loaded and then merged with the permute below.
17490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lvx     v21,   0, \R
17590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    lvx     v22, r10, \R
17690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
17790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.if \increment_counter
17890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    add     \R, \R, \P
17990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endif
18090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vperm   \V, v21, v22, v17
18290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber.endm
18390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
18490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .align 2
;# vp8_sub_pixel_variance4x4_ppc
;#  Filters a 4x4 source block by (xoffset, yoffset), compares it with the
;#  4x4 block at dst_ptr, stores the sum of squared differences through
;#  the sse pointer and returns sse - ((sum * sum) >> 4).
18590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r3 unsigned char  *src_ptr
18690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r4 int  src_pixels_per_line
18790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r5 int  xoffset
18890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r6 int  yoffset
18990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r7 unsigned char *dst_ptr
19090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r8 int dst_pixels_per_line
19190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r9 unsigned int *sse
19290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#
19390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r3 return value
;#
;# scratch: r0, r10, r11 (saved VRSAVE), r12, CR0,
;#          v0-v7, v10, v11, v17-v25, v28
19490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervp8_sub_pixel_variance4x4_ppc:
19590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mfspr   r11, 256            ;# get old VRSAVE
    ;# Flag v0-v7, v10-v11 and v16-v28 as live.  v5-v7 are written when
    ;#  the reference rows are loaded below, so they must be flagged too.
19690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    oris    r12, r11, 0xff30
19790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ori     r12, r12, 0xfff8
19890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mtspr   256, r12            ;# set VRSAVE
19990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    stwu    r1,-32(r1)          ;# create space on the stack
20190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# xoffset == 0 leaves through the label; otherwise CR0 reflects yoffset
20290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    HProlog second_pass_4x4_pre_copy_b
20390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# Load up permutation constants
20590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_c v10, b_0123_b, 0, r12, r0
20690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_c v11, b_4567_b, 0, r12, r0
20790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
20890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v0, v10, v11, 1
20990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v1, v10, v11, 1
21090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v2, v10, v11, 1
21190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v3, v10, v11, 1
21290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# Finished filtering main horizontal block.  If there is no
21490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  vertical filtering, jump to storing the data.  Otherwise
21590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  load up and filter the additional line that is needed
21690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  for the vertical filter.
    ;# (CR0 still holds the yoffset test made at the end of HProlog.)
21790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq     compute_sum_sse_4x4_b
21890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
21990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v4, v10, v11, 0
22090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    b   second_pass_4x4_b
22290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# Reached when xoffset == 0: take the five source rows unfiltered.
    ;#  This path always runs the vertical pass, even for yoffset == 0.
22390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecond_pass_4x4_pre_copy_b:
22490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    slwi    r6, r6, 5           ;# index into vertical filter array
22590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
22690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v0, r3, r4, 1
22790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v1, r3, r4, 1
22890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v2, r3, r4, 1
22990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v3, r3, r4, 1
23090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v4, r3, r4, 0
23190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecond_pass_4x4_b:
    ;# halfword rounding constant for the vertical pass: 8 << 3 = 64
23390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v20, 8
23490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v18, 3
23590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
23690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# v20, v21 = vertical taps; this also overwrites r10
23790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_vfilter v20, v21
23890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
23990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v0,  v1
24090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v1,  v2
24190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v2,  v3
24290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v3,  v4
24390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
24490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubercompute_sum_sse_4x4_b:
24590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v18, 0             ;# sum
24690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v19, 0             ;# sse
24790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v23, 0             ;# unpack
24890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    li      r10, 16             ;# restore the 16 that load_vfilter overwrote
24990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# four reference rows into v4-v7
25090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v4, r7, r8, 1
25190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v5, r7, r8, 1
25290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v6, r7, r8, 1
25390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v7, r7, r8, 1
25490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# Interleave rows pairwise, then gather with b_hilo_b so that v0 holds
    ;#  the filtered block and v1 the reference block, laid out identically.
25590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v0, v0, v1
25690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v1, v2, v3
25790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
25890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v2, v4, v5
25990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v3, v6, v7
26090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_c v10, b_hilo_b, 0, r12, r0
26290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vperm   v0, v0, v1, v10
26490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vperm   v1, v2, v3, v10
26590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
26690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
26790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# 16 pixels, so the squared sum is divided by 1 << 4
26890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    variance_final v18, v19, v23, 4
26990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    addi    r1, r1, 32          ;# recover stack
27190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mtspr   256, r11            ;# reset old VRSAVE
27290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    blr
27490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
27590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .align 2
;# vp8_sub_pixel_variance8x8_ppc
;#  Filters an 8x8 source block by (xoffset, yoffset), compares it with the
;#  8x8 block at dst_ptr, stores the sum of squared differences through
;#  the sse pointer and returns sse - ((sum * sum) >> 6).
27690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r3 unsigned char  *src_ptr
27790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r4 int  src_pixels_per_line
27890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r5 int  xoffset
27990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r6 int  yoffset
28090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r7 unsigned char *dst_ptr
28190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r8 int dst_pixels_per_line
28290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r9 unsigned int *sse
28390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#
28490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r3 return value
;#
;# scratch: r0, r10, r11 (saved VRSAVE), r12, CR0,
;#          v0-v11, v17-v25, v28
28590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervp8_sub_pixel_variance8x8_ppc:
28690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mfspr   r11, 256            ;# get old VRSAVE
    ;# flag v0-v11 and v16-v31 as live
28790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    oris    r12, r11, 0xfff0
28890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ori     r12, r12, 0xffff
28990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mtspr   256, r12            ;# set VRSAVE
29090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    stwu    r1,-32(r1)          ;# create space on the stack
29290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# xoffset == 0 leaves through the label; otherwise CR0 reflects yoffset
29390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    HProlog second_pass_8x8_pre_copy_b
29490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# Load up permutation constants
29690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_c v10, b_0123_b, 0, r12, r0
29790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_c v11, b_4567_b, 0, r12, r0
29890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
29990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v0, v10, v11, 1
30090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v1, v10, v11, 1
30190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v2, v10, v11, 1
30290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v3, v10, v11, 1
30390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v4, v10, v11, 1
30490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v5, v10, v11, 1
30590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v6, v10, v11, 1
30690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v7, v10, v11, 1
30790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
30890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# Finished filtering main horizontal block.  If there is no
30990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  vertical filtering, jump to storing the data.  Otherwise
31090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  load up and filter the additional line that is needed
31190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  for the vertical filter.
    ;# (CR0 still holds the yoffset test made at the end of HProlog.)
31290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq     compute_sum_sse_8x8_b
31390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_8 v8, v10, v11, 0
31590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
31690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    b   second_pass_8x8_b
31790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# Reached when xoffset == 0: take the nine source rows unfiltered.
31890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecond_pass_8x8_pre_copy_b:
31990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    slwi.   r6, r6, 5           ;# index into vertical filter array, sets CR0
32090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
32190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v0, r3, r4, 1
32290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v1, r3, r4, 1
32390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v2, r3, r4, 1
32490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v3, r3, r4, 1
32590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v4, r3, r4, 1
32690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v5, r3, r4, 1
32790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v6, r3, r4, 1
32890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v7, r3, r4, 1
32990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v8, r3, r4, 0
33090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# yoffset == 0 as well: no filtering at all, go straight to the compare
33190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq     compute_sum_sse_8x8_b
33290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
33390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecond_pass_8x8_b:
    ;# halfword rounding constant for the vertical pass: 8 << 3 = 64
33490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v20, 8
33590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v18, 3
33690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
33790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# v20, v21 = vertical taps; this also overwrites r10
33890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_vfilter v20, v21
33990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v0, v1
34190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v1, v2
34290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v2, v3
34390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v3, v4
34490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v4, v5
34590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v5, v6
34690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v6, v7
34790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v7, v8
34890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
34990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubercompute_sum_sse_8x8_b:
35090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v18, 0             ;# sum
35190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v19, 0             ;# sse
35290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v23, 0             ;# unpack
35390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    li      r10, 16             ;# restore the 16 that load_vfilter overwrote
35490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# merge the filtered rows pairwise into v0-v3
35590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v0, v0, v1
35690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v1, v2, v3
35790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v2, v4, v5
35890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v3, v6, v7
35990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# eight reference rows into v4-v11 (v10 and v11 are free again here)
36090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v4,  r7, r8, 1
36190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v5,  r7, r8, 1
36290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v6,  r7, r8, 1
36390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v7,  r7, r8, 1
36490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v8,  r7, r8, 1
36590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v9,  r7, r8, 1
36690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v10, r7, r8, 1
36790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16 v11, r7, r8, 0
36890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# merge the reference rows the same way into v4-v7
36990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v4, v4,  v5
37090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v5, v6,  v7
37190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v6, v8,  v9
37290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vmrghb  v7, v10, v11
37390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
37490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
37590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
37690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
37790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
37890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
    ;# 64 pixels, so the squared sum is divided by 1 << 6
37990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    variance_final v18, v19, v23, 6
38090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
38190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    addi    r1, r1, 32          ;# recover stack
38290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mtspr   256, r11            ;# reset old VRSAVE
38390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    blr
38490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# ---------------------------------------------------------------------
;# vp8_sub_pixel_variance8x16_ppc
;#
;# Sub-pixel variance of an 8-wide, 16-high block: the source block is
;# run through the horizontal and/or vertical filter passes and the
;# result is compared with the destination block.
;#
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
;#
;# Register use visible in this routine:
;#  r10, r11, r12, r0   scratch (r11 keeps the caller's VRSAVE)
;#  v0-v16              filtered source rows
;#  v8-v15              reused for destination rows once the source
;#                      rows have been packed into v0-v7
;#  v18/v19/v23         sum / sse / zero for the accumulation macros
;#  v20/v21             vertical filter taps, then temporaries
;#  v29/v30             permutation constants for hfilter_8
;# HProlog, hfilter_8, load_and_align_16, load_vfilter, vfilter_16,
;# compute_sum_sse and variance_final are macros defined elsewhere in
;# this file; they may use further registers.
;# ---------------------------------------------------------------------
    .align 2
vp8_sub_pixel_variance8x16_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff    ;# VRSAVE bits 0-15  -> v0-v15 in use
    ori     r12, r12, 0xfffe    ;# VRSAVE bits 16-30 -> v16-v30 in use
                                ;#  (v30 is loaded below, so the mask
                                ;#  has to reach bit 30, not bit 29)
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1,-32(r1)          ;# create space on the stack

    ;# HProlog is handed the label of the path that skips the
    ;#  horizontal filter (presumably taken when xoffset is zero --
    ;#  confirm against the macro definition).
    HProlog second_pass_8x16_pre_copy_b

    ;# Load up permutation constants
    load_c v29, b_0123_b, 0, r12, r0
    load_c v30, b_4567_b, 0, r12, r0

    ;# Horizontally filter the 16 rows of the block, one row per
    ;#  vector register.
    hfilter_8 v0,  v29, v30, 1
    hfilter_8 v1,  v29, v30, 1
    hfilter_8 v2,  v29, v30, 1
    hfilter_8 v3,  v29, v30, 1
    hfilter_8 v4,  v29, v30, 1
    hfilter_8 v5,  v29, v30, 1
    hfilter_8 v6,  v29, v30, 1
    hfilter_8 v7,  v29, v30, 1
    hfilter_8 v8,  v29, v30, 1
    hfilter_8 v9,  v29, v30, 1
    hfilter_8 v10, v29, v30, 1
    hfilter_8 v11, v29, v30, 1
    hfilter_8 v12, v29, v30, 1
    hfilter_8 v13, v29, v30, 1
    hfilter_8 v14, v29, v30, 1
    hfilter_8 v15, v29, v30, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump straight to the sum/sse computation.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    ;#  NOTE(review): the condition tested here is not set in this
    ;#  routine; presumably HProlog leaves it in cr0 -- confirm.
    beq     compute_sum_sse_8x16_b

    hfilter_8 v16, v29, v30, 0  ;# 17th row, no pointer advance

    b   second_pass_8x16_b

    ;# No horizontal filtering: fetch the rows unfiltered.
second_pass_8x16_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array;
                                ;#  the record form also notes in cr0
                                ;#  whether yoffset is zero

    load_and_align_16 v0,  r3, r4, 1
    load_and_align_16 v1,  r3, r4, 1
    load_and_align_16 v2,  r3, r4, 1
    load_and_align_16 v3,  r3, r4, 1
    load_and_align_16 v4,  r3, r4, 1
    load_and_align_16 v5,  r3, r4, 1
    load_and_align_16 v6,  r3, r4, 1
    load_and_align_16 v7,  r3, r4, 1
    load_and_align_16 v8,  r3, r4, 1
    load_and_align_16 v9,  r3, r4, 1
    load_and_align_16 v10, r3, r4, 1
    load_and_align_16 v11, r3, r4, 1
    load_and_align_16 v12, r3, r4, 1
    load_and_align_16 v13, r3, r4, 1
    load_and_align_16 v14, r3, r4, 1
    load_and_align_16 v15, r3, r4, 1
    load_and_align_16 v16, r3, r4, 0

    ;# yoffset == 0: nothing to filter vertically.  Relies on the
    ;#  loads above leaving cr0 as slwi. set it.
    beq     compute_sum_sse_8x16_b

    ;# Vertical pass: each row is filtered against the row below it.
second_pass_8x16_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8
    vfilter_16 v8,  v9
    vfilter_16 v9,  v10
    vfilter_16 v10, v11
    vfilter_16 v11, v12
    vfilter_16 v12, v13
    vfilter_16 v13, v14
    vfilter_16 v14, v15
    vfilter_16 v15, v16

compute_sum_sse_8x16_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    ;# Each row only occupies the high 8 bytes of its register, so
    ;#  interleave two rows into one full vector: 16 rows -> v0-v7.
    vmrghb  v0, v0,  v1
    vmrghb  v1, v2,  v3
    vmrghb  v2, v4,  v5
    vmrghb  v3, v6,  v7
    vmrghb  v4, v8,  v9
    vmrghb  v5, v10, v11
    vmrghb  v6, v12, v13
    vmrghb  v7, v14, v15

    ;# Destination rows 0-7, interleaved the same way as the source
    ;#  so that corresponding pixels line up.
    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 1
    load_and_align_16 v12, r7, r8, 1
    load_and_align_16 v13, r7, r8, 1
    load_and_align_16 v14, r7, r8, 1
    load_and_align_16 v15, r7, r8, 1

    vmrghb  v8,  v8,  v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
    compute_sum_sse v3, v11, v18, v19, v20, v21, v23

    ;# Destination rows 8-15 (the last load does not advance r7).
    load_and_align_16 v8,  r7, r8, 1
    load_and_align_16 v9,  r7, r8, 1
    load_and_align_16 v10, r7, r8, 1
    load_and_align_16 v11, r7, r8, 1
    load_and_align_16 v12, r7, r8, 1
    load_and_align_16 v13, r7, r8, 1
    load_and_align_16 v14, r7, r8, 1
    load_and_align_16 v15, r7, r8, 0

    vmrghb  v8,  v8,  v9
    vmrghb  v9,  v10, v11
    vmrghb  v10, v12, v13
    vmrghb  v11, v14, v15

    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
    compute_sum_sse v7, v11, v18, v19, v20, v21, v23

    ;# Final shift of 7: presumably log2 of the 8*16 = 128 pixels.
    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack
    mtspr   256, r11            ;# reset old VRSAVE
    blr
54190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# Filters one 16-pixel horizontal line.
;#
;# Arguments:
;#  V                  vector register that receives the 16 filtered bytes
;#  increment_counter  non-zero: advance r3 by the pitch after loading
;#
;# expects:
;#  r3  src_ptr
;#  r4  pitch
;#  r10 16
;#  r12 32
;#  v17 perm input
;#  v18 rounding
;#  v19 shift
;#  v20 filter taps
;#  v21 tmp
;#  v22 tmp
;#  v23 tmp
;#  v24 tmp
;#  v25 tmp
;#  v26 tmp
;#  v27 tmp
;#  v28 perm output
;#
;# v17 is overwritten here; v21-v27 are clobbered.
;#
.macro hfilter_16 V, increment_counter

    lvsl    v17,  0, r3         ;# permute control for alignment

    ;# The filter input is wider than the 16-byte output (original
    ;#  note: 21 bytes), so an unaligned row can span three vectors.
    lvx     v21,   0, r3
    lvx     v22, r10, r3
    lvx     v23, r12, r3

.if \increment_counter
    add     r3, r3, r4
.endif
    vperm   v21, v21, v22, v17
    vperm   v22, v22, v23, v17  ;# v21 v22 = input pixels left-justified

    ;# Each vmsummbm yields four 32-bit sums (taps times four
    ;#  consecutive bytes, plus the rounding term in v18), i.e. every
    ;#  fourth output pixel.  Shifting the input left by 1, 2 and 3
    ;#  bytes produces the remaining three phases.

    ;# set 0
    vmsummbm v24, v20, v21, v18 ;# taps times elements

    ;# set 1
    vsldoi  v23, v21, v22, 1
    vmsummbm v25, v20, v23, v18

    ;# set 2
    vsldoi  v23, v21, v22, 2
    vmsummbm v26, v20, v23, v18

    ;# set 3
    vsldoi  v23, v21, v22, 3
    vmsummbm v27, v20, v23, v18

    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F

    vsrh    v24, v24, v19       ;# scale v24, v25 down by the shift in
    vsrh    v25, v25, v19       ;#  v19 (divide by 128 per the original note)

    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
    ;# v0 is only a filler second source here.
    ;#  NOTE(review): assumes every index in v28 is below 16 so that
    ;#  only \V is selected; v28 is set up outside this macro -- confirm.
    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
.endm
60190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
;# ---------------------------------------------------------------------
;# vp8_sub_pixel_variance16x8_ppc
;#
;# Sub-pixel variance of a 16-wide, 8-high block: the source block is
;# run through the horizontal and/or vertical filter passes and the
;# result is compared with the destination block.
;#
;# r3 unsigned char  *src_ptr
;# r4 int  src_pixels_per_line
;# r5 int  xoffset
;# r6 int  yoffset
;# r7 unsigned char *dst_ptr
;# r8 int dst_pixels_per_line
;# r9 unsigned int *sse
;#
;# r3 return value
;#
;# Register use visible in this routine:
;#  r10, r11, r12       scratch (r11 keeps the caller's VRSAVE)
;#  v0-v8               filtered source rows (v8 is the extra row
;#                      needed by the vertical filter)
;#  v18/v19/v23         sum / sse / zero for the accumulation macros
;#  v20/v21             vertical filter taps
;#  v17-v28             used by hfilter_16
;# HProlog, load_and_align_16, load_vfilter, vfilter_16,
;# compute_sum_sse_16 and variance_final are macros defined elsewhere
;# in this file; they may use further registers.
;# ---------------------------------------------------------------------
    .align 2
vp8_sub_pixel_variance16x8_ppc:
    mfspr   r11, 256            ;# get old VRSAVE
    oris    r12, r11, 0xffff    ;# VRSAVE bits 0-15  -> v0-v15 in use
    ori     r12, r12, 0xfff8    ;# VRSAVE bits 16-28 -> v16-v28 in use
    mtspr   256, r12            ;# set VRSAVE

    stwu    r1, -32(r1)         ;# create space on the stack

    ;# HProlog is handed the label of the path that skips the
    ;#  horizontal filter (presumably taken when xoffset is zero --
    ;#  confirm against the macro definition).
    HProlog second_pass_16x8_pre_copy_b

    ;# Horizontally filter the 8 rows of the block.
    hfilter_16 v0, 1
    hfilter_16 v1, 1
    hfilter_16 v2, 1
    hfilter_16 v3, 1
    hfilter_16 v4, 1
    hfilter_16 v5, 1
    hfilter_16 v6, 1
    hfilter_16 v7, 1

    ;# Finished filtering main horizontal block.  If there is no
    ;#  vertical filtering, jump straight to the sum/sse computation.
    ;#  Otherwise load up and filter the additional line that is
    ;#  needed for the vertical filter.
    ;#  NOTE(review): the condition tested here is not set in this
    ;#  routine; presumably HProlog leaves it in cr0 -- confirm.
    beq     compute_sum_sse_16x8_b

    hfilter_16 v8, 0            ;# 9th row, no pointer advance

    b   second_pass_16x8_b

    ;# No horizontal filtering: fetch the rows unfiltered.
second_pass_16x8_pre_copy_b:
    slwi.   r6, r6, 5           ;# index into vertical filter array;
                                ;#  the record form also notes in cr0
                                ;#  whether yoffset is zero

    load_and_align_16  v0,  r3, r4, 1
    load_and_align_16  v1,  r3, r4, 1
    load_and_align_16  v2,  r3, r4, 1
    load_and_align_16  v3,  r3, r4, 1
    load_and_align_16  v4,  r3, r4, 1
    load_and_align_16  v5,  r3, r4, 1
    load_and_align_16  v6,  r3, r4, 1
    load_and_align_16  v7,  r3, r4, 1
    ;# Last row: r3 is not advanced.  The filtered path above also
    ;#  stops at this row (hfilter_16 v8, 0) before both paths merge,
    ;#  and the 8x16/16x16 routines end their row loads the same way.
    load_and_align_16  v8,  r3, r4, 0

    ;# yoffset == 0: nothing to filter vertically.  Relies on the
    ;#  loads above leaving cr0 as slwi. set it.
    beq     compute_sum_sse_16x8_b

    ;# Vertical pass: each row is filtered against the row below it.
second_pass_16x8_b:
    vspltish v20, 8
    vspltish v18, 3
    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040

    load_vfilter v20, v21

    vfilter_16 v0,  v1
    vfilter_16 v1,  v2
    vfilter_16 v2,  v3
    vfilter_16 v3,  v4
    vfilter_16 v4,  v5
    vfilter_16 v5,  v6
    vfilter_16 v6,  v7
    vfilter_16 v7,  v8

compute_sum_sse_16x8_b:
    vspltish v18, 0             ;# sum
    vspltish v19, 0             ;# sse
    vspltish v23, 0             ;# unpack
    li      r10, 16

    ;# One accumulation step per row; the last one passes 0 instead
    ;#  of 1 (presumably "do not advance the destination pointer").
    compute_sum_sse_16 v0, 1
    compute_sum_sse_16 v1, 1
    compute_sum_sse_16 v2, 1
    compute_sum_sse_16 v3, 1
    compute_sum_sse_16 v4, 1
    compute_sum_sse_16 v5, 1
    compute_sum_sse_16 v6, 1
    compute_sum_sse_16 v7, 0

    ;# Final shift of 7: presumably log2 of the 16*8 = 128 pixels.
    variance_final v18, v19, v23, 7

    addi    r1, r1, 32          ;# recover stack

    mtspr   256, r11            ;# reset old VRSAVE

    blr
69490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
69590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .align 2
69690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r3 unsigned char  *src_ptr
69790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r4 int  src_pixels_per_line
69890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r5 int  xoffset
69990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r6 int  yoffset
70090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r7 unsigned char *dst_ptr
70190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r8 int dst_pixels_per_line
70290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r9 unsigned int *sse
70390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;#
70490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber;# r3 return value
70590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervp8_sub_pixel_variance16x16_ppc:
70690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mfspr   r11, 256            ;# get old VRSAVE
70790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    oris    r12, r11, 0xffff
70890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ori     r12, r12, 0xfff8
70990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mtspr   256, r12            ;# set VRSAVE
71090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    stwu    r1, -32(r1)         ;# create space on the stack
71290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    HProlog second_pass_16x16_pre_copy_b
71490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
71590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v0,  1
71690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v1,  1
71790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v2,  1
71890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v3,  1
71990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v4,  1
72090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v5,  1
72190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v6,  1
72290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v7,  1
72390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v8,  1
72490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v9,  1
72590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v10, 1
72690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v11, 1
72790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v12, 1
72890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v13, 1
72990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v14, 1
73090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v15, 1
73190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;# Finished filtering main horizontal block.  If there is no
73390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  vertical filtering, jump to storing the data.  Otherwise
73490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  load up and filter the additional line that is needed
73590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    ;#  for the vertical filter.
73690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq     compute_sum_sse_16x16_b
73790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
73890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    hfilter_16 v16, 0
73990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    b   second_pass_16x16_b
74190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecond_pass_16x16_pre_copy_b:
74390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    slwi.   r6, r6, 5           ;# index into vertical filter array
74490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
74590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v0,  r3, r4, 1
74690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v1,  r3, r4, 1
74790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v2,  r3, r4, 1
74890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v3,  r3, r4, 1
74990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v4,  r3, r4, 1
75090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v5,  r3, r4, 1
75190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v6,  r3, r4, 1
75290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v7,  r3, r4, 1
75390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v8,  r3, r4, 1
75490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v9,  r3, r4, 1
75590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v10, r3, r4, 1
75690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v11, r3, r4, 1
75790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v12, r3, r4, 1
75890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v13, r3, r4, 1
75990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v14, r3, r4, 1
76090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v15, r3, r4, 1
76190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_and_align_16  v16, r3, r4, 0
76290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    beq     compute_sum_sse_16x16_b
76490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
76590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubersecond_pass_16x16_b:
76690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v20, 8
76790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v18, 3
76890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
76990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    load_vfilter v20, v21
77190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
77290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v0,  v1
77390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v1,  v2
77490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v2,  v3
77590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v3,  v4
77690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v4,  v5
77790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v5,  v6
77890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v6,  v7
77990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v7,  v8
78090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v8,  v9
78190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v9,  v10
78290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v10, v11
78390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v11, v12
78490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v12, v13
78590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v13, v14
78690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v14, v15
78790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vfilter_16 v15, v16
78890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
78990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubercompute_sum_sse_16x16_b:
79090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v18, 0             ;# sum
79190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v19, 0             ;# sse
79290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    vspltish v23, 0             ;# unpack
79390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    li      r10, 16
79490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
79590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v0,  1
79690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v1,  1
79790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v2,  1
79890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v3,  1
79990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v4,  1
80090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v5,  1
80190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v6,  1
80290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v7,  1
80390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v8,  1
80490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v9,  1
80590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v10, 1
80690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v11, 1
80790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v12, 1
80890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v13, 1
80990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v14, 1
81090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    compute_sum_sse_16 v15, 0
81190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    variance_final v18, v19, v23, 8
81390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    addi    r1, r1, 32          ;# recover stack
81590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    mtspr   256, r11            ;# reset old VRSAVE
81790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
81890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    blr
81990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
82090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .data
82190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
82290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .align 4                    ;# power-of-two alignment: start the table on a 2^4 = 16-byte boundary
82390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberhfilter_b:                      ;# 8 rows x 16 bytes; row k repeats the 4-byte group (128-16k, 16k, 0, 0) four times
82490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0 ;# k=0: all weight on the first tap
82590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0 ;# k=1: the two non-zero taps always sum to 128
82690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0 ;# k=2
82790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0 ;# k=3
82890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0 ;# k=4: equal weights
82990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0 ;# k=5
83090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0 ;# k=6
83190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0 ;# k=7 (presumably k is the horizontal sub-pixel offset - TODO confirm at the load site)
83290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
83390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .align 4                    ;# power-of-two alignment: start the table on a 2^4 = 16-byte boundary
83490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Hubervfilter_b:                      ;# 8 pairs of 16-byte rows; pair k = sixteen copies of (128-16k), then sixteen copies of 16k
83590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128 ;# pair 0, first weight
83690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 ;# pair 0, second weight (each pair sums to 128)
83790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 ;# pair 1
83890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
83990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 ;# pair 2
84090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
84190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80 ;# pair 3
84290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
84390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 ;# pair 4: equal weights
84490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
84590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48 ;# pair 5
84690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
84790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32 ;# pair 6
84890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
84990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 ;# pair 7
85090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112 ;# NOTE(review): presumably one pair is fetched by load_vfilter per vertical offset - macro body not visible here, confirm
85190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
85290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .align 4                    ;# power-of-two alignment: start the table on a 2^4 = 16-byte boundary
85390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberb_hperm_b:                      ;# 16 byte indices taking every 4th byte: 0,4,8,12 then 1,5,9,13 ... (a 4x4 byte transpose order)
85490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15 ;# presumably a vperm control vector - TODO confirm at the use site
85590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
85690d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .align 4                    ;# power-of-two alignment: start the table on a 2^4 = 16-byte boundary
85790d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberb_0123_b:                       ;# 16 byte indices forming four overlapping 4-byte windows that start at bytes 0, 1, 2 and 3
85890d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 ;# presumably a vperm control vector - TODO confirm at the use site
85990d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86090d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .align 4                    ;# power-of-two alignment: start the table on a 2^4 = 16-byte boundary
86190d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberb_4567_b:                       ;# same window pattern as b_0123_b moved up by 4: windows start at bytes 4, 5, 6 and 7
86290d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 ;# presumably a vperm control vector - TODO confirm at the use site
86390d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber
86490d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huberb_hilo_b:                       ;# NOTE(review): no .align 4 of its own; 16-byte aligned only because b_4567_b above is aligned and exactly 16 bytes long
86590d3ed91ae9228e1c8bab561b6138d4cb8c1e4fdAndreas Huber    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23 ;# indices 0-7 then 16-23; as a vperm control this would take the first 8 bytes of each of two source vectors - TODO confirm
866