; vp8_variance8x8_armv6.asm revision 79f15823c34ae1e423108295e416213200bb280f
1;
2;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12    EXPORT  |vp8_variance8x8_armv6|
13
14    ARM
15
16    AREA ||.text||, CODE, READONLY, ALIGN=2
17
18; r0    unsigned char *src_ptr
19; r1    int source_stride
20; r2    unsigned char *ref_ptr
21; r3    int  recon_stride
22; stack unsigned int *sse
;-----------------------------------------------------------------------
; unsigned int vp8_variance8x8_armv6(const unsigned char *src_ptr,
;                                    int source_stride,
;                                    const unsigned char *ref_ptr,
;                                    int recon_stride,
;                                    unsigned int *sse)
;
; Computes the variance of an 8x8 block of pixels:
;   *sse   = sum of squared byte differences (src - ref)
;   return = sse - ((sum * sum) >> 6)        ; >>6 == /64 == /(8*8)
;
; ABI (AAPCS): r0 = src_ptr, r1 = source_stride, r2 = ref_ptr,
;              r3 = recon_stride, sse pointer passed on the stack.
; Uses ARMv6 SIMD media instructions (usub8/sel/usad8/uxtb16/smlad)
; to process 4 pixels per load, 8 pixels (one row) per loop iteration.
;
; Register roles inside the loop:
;   r4  = running signed sum of (src - ref) differences
;   r5  = running sum of squared differences (sse)
;   r12 = row counter (counts down from 8)
;   lr  = constant zero (second operand for sel/usad8)
;-----------------------------------------------------------------------
|vp8_variance8x8_armv6| PROC

    push    {r4-r10, lr}
    mov     r12, #8             ; set loop counter to 8 (=block height)
    mov     r4, #0              ; initialize sum = 0
    mov     r5, #0              ; initialize sse = 0

loop
    ; 1st 4 pixels of the current row
    ldr     r6, [r0, #0x0]      ; load 4 src pixels
    ldr     r7, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    ; usub8 subtracts byte-wise and sets the APSR GE flag per byte;
    ; sel then keeps a byte only where the subtraction did not borrow,
    ; and takes zero (lr) elsewhere.  Doing this in both directions
    ; splits the differences into positive and negative magnitudes.
    usub8   r8, r6, r7          ; calculate difference
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums (usad8 vs zero = horizontal byte sum)
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; |differences| of all 4 pixels (the two
                                ; sel results are byte-wise disjoint)
    ; calculate total sum
    add    r4, r4, r6           ; add positive differences to sum
    sub    r4, r4, r7           ; subtract negative differences from sum

    ; calculate sse: widen |diff| bytes to halfword pairs, then
    ; smlad squares and accumulates two pixels per instruction
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels of the row (pointer advances interleaved below to
    ; hide load/ALU latency)
    ldr     r6, [r0, #0x4]      ; load 4 src pixels
    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    usub8   r8, r6, r7          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; |differences| of all 4 pixels

    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
    subs    r12, r12, #1        ; next row
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    bne     loop

    ; write out *sse and compute the return value
    ldr     r8, [sp, #32]       ; get address of sse (8 regs pushed = 32 bytes)
    mul     r1, r4, r4          ; sum * sum
    str     r5, [r8]            ; store sse
    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))

    pop     {r4-r10, pc}

    ENDP
94
95    END
96