1;
2;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
; Module directives (ARM assembler / armasm syntax).
    EXPORT  |vp8_variance16x16_armv6|   ; make the routine visible to the linker

    ARM                                 ; assemble as ARM (not Thumb) instructions
    REQUIRE8                            ; this code requires 8-byte stack alignment
    PRESERVE8                           ; ... and preserves it (10 words are pushed)

    AREA ||.text||, CODE, READONLY, ALIGN=2    ; code section, 4-byte aligned

;-----------------------------------------------------------------------
; unsigned int vp8_variance16x16_armv6(unsigned char *src_ptr,
;                                      int            source_stride,
;                                      unsigned char *ref_ptr,
;                                      int            recon_stride,
;                                      unsigned int  *sse)
;
; Computes, over a 16x16 block of 8-bit pixels:
;     sum  = SUM(src[i] - ref[i])          (signed)
;     *sse = SUM((src[i] - ref[i])^2)
;     return *sse - ((sum * sum) >> 8)     (256 pixels => >> 8)
;
; In:
;   r0    unsigned char *src_ptr
;   r1    int  source_stride   (bytes between source rows)
;   r2    unsigned char *ref_ptr
;   r3    int  recon_stride    (bytes between reference rows)
;   stack unsigned int *sse    (at [sp, #40] once r4-r12, lr are pushed)
; Out:
;   r0    variance as described above
;   *sse  sum of squared differences
;
; Register roles inside the loop:
;   r4, r5   pixel words, then partial sums / halfword pairs (scratch)
;   r6       per-byte differences, finally |src - ref| for 4 pixels
;   r7       bytes where src >= ref (src - ref), other bytes zero
;   r9       ref - src (per byte, wrapping)
;   r10      second pair of |diff| values widened to halfwords
;   r8       running signed sum of differences
;   r11      running sum of squared differences
;   r12      rows remaining
;   lr       constant zero (operand for sel / usad8)
;
; Pixels are fetched four at a time with word loads (ldr) at offsets
; 0, 4, 8 and 12 of each row.
; The second smlad of each 4-pixel group is deliberately issued after the
; loads of the next group; keep that interleaving when editing.
;-----------------------------------------------------------------------
|vp8_variance16x16_armv6| PROC

    stmfd   sp!, {r4-r12, lr}   ; save callee-saved registers and return address
    mov     r8, #0              ; sum = 0
    mov     r11, #0             ; sse = 0
    mov     r12, #16            ; loop counter = 16 (block height)
    mov     lr, #0              ; constant zero, never written inside the loop

loop
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    usub8   r6, r4, r5          ; per-byte src - ref, sets GE flags per byte
    sel     r7, r6, lr          ; keep bytes with non-negative difference
    usub8   r9, r5, r4          ; per-byte ref - src, sets GE flags per byte
    sel     r6, r9, lr          ; keep bytes where ref - src is non-negative

    ; calculate partial sums
    usad8   r4, r7, lr          ; sum of positive differences
    usad8   r5, r6, lr          ; sum of negative differences (as magnitudes)
    orr     r6, r6, r7          ; absolute differences of all 4 pixels

    ; calculate total sum (flags are not needed here, so no 's' suffix)
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r5, [r2, #4]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5          ; per-byte src - ref
    sel     r7, r6, lr          ; keep bytes with non-negative difference
    usub8   r9, r5, r4          ; per-byte ref - src
    sel     r6, r9, lr          ; keep bytes where ref - src is non-negative

    ; calculate partial sums
    usad8   r4, r7, lr          ; sum of positive differences
    usad8   r5, r6, lr          ; sum of negative differences (as magnitudes)
    orr     r6, r6, r7          ; absolute differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r5, [r2, #8]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5          ; per-byte src - ref
    sel     r7, r6, lr          ; keep bytes with non-negative difference
    usub8   r9, r5, r4          ; per-byte ref - src
    sel     r6, r9, lr          ; keep bytes where ref - src is non-negative

    ; calculate partial sums
    usad8   r4, r7, lr          ; sum of positive differences
    usad8   r5, r6, lr          ; sum of negative differences (as magnitudes)
    orr     r6, r6, r7          ; absolute differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r5, [r2, #12]       ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += d1*d1 + d3*d3 (previous group)

    usub8   r6, r4, r5          ; per-byte src - ref
    add     r0, r0, r1          ; advance src_ptr to next row
    sel     r7, r6, lr          ; keep bytes with non-negative difference
    usub8   r9, r5, r4          ; per-byte ref - src
    add     r2, r2, r3          ; advance ref_ptr to next row
    sel     r6, r9, lr          ; keep bytes where ref - src is non-negative

    ; calculate partial sums
    usad8   r4, r7, lr          ; sum of positive differences
    usad8   r5, r6, lr          ; sum of negative differences (as magnitudes)
    orr     r6, r6, r7          ; absolute differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r11, r5, r5, r11    ; sse += d0*d0 + d2*d2
    smlad   r11, r10, r10, r11  ; sse += d1*d1 + d3*d3

    subs    r12, r12, #1        ; one row done
    bne     loop                ; repeat until all 16 rows are processed

    ; return stuff
    ldr     r6, [sp, #40]       ; sse pointer: 5th argument, above the 10 saved words
    mul     r0, r8, r8          ; sum * sum (low 32 bits)
    str     r11, [r6]           ; *sse = sse
    ; |sum| can reach 255 * 256 = 65280, so sum * sum can reach 4261478400:
    ; larger than 2^31 - 1 but still below 2^32. The product must therefore
    ; be treated as unsigned and shifted logically; an arithmetic shift would
    ; sign-extend bit 31 and corrupt the result for strongly biased blocks.
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}   ; restore registers and return

    ENDP

    END
148
149