;
;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


12    EXPORT  |vp8_variance16x16_armv6|
13
14    ARM
15    REQUIRE8
16    PRESERVE8
17
18    AREA ||.text||, CODE, READONLY, ALIGN=2
19
;-----------------------------------------------------------------------
; vp8_variance16x16_armv6
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr, accumulating
;   sum = SUM(src[i] - ref[i])          (signed)
;   sse = SUM((src[i] - ref[i])^2)
; then stores sse through the stack argument and returns
;   sse - ((sum * sum) >> 8)
;
; In:
;   r0    unsigned char *src_ptr
;   r1    int source_stride
;   r2    unsigned char *ref_ptr
;   r3    int  recon_stride
;   stack unsigned int *sse
; Out:
;   r0    sse - ((sum * sum) >> 8)
;   *sse  sum of squared differences
;
; Register use inside the loop:
;   r8  = running sum          r11 = running sse
;   r12 = rows remaining       lr  = constant zero
;   r4-r7, r9, r10 = scratch
; r4-r12 and lr are pushed on entry and restored on exit.
; Each row is fetched with four word loads at offsets 0/4/8/12.
;-----------------------------------------------------------------------
|vp8_variance16x16_armv6| PROC

    stmfd   sp!, {r4-r12, lr}   ; 10 registers = 40 bytes pushed

    pld     [r0, r1, lsl #0]    ; prefetch the second src row
    pld     [r2, r3, lsl #0]    ; prefetch the second ref row

    mov     r8, #0              ; sum = 0
    mov     r11, #0             ; sse = 0
    mov     r12, #16            ; loop counter = 16 (block height)
    mov     lr, #0              ; constant zero; never written inside the
                                ; loop, so it is set up once here

row_loop
    ; ---- pixels 0..3 of the row ----
    ldr     r4, [r0, #0]        ; 4 src pixels
    ldr     r5, [r2, #0]        ; 4 ref pixels

    ; usub8 sets one GE flag per byte lane when that lane did not borrow
    ; (minuend >= subtrahend); sel then keeps the difference byte for those
    ; lanes and takes 0 (from lr) for the others.
    usub8   r6, r4, r5          ; per-byte src - ref
    pld     [r0, r1, lsl #1]    ; prefetch src two rows ahead
    sel     r7, r6, lr          ; keep lanes where src >= ref
    usub8   r9, r5, r4          ; per-byte ref - src
    pld     [r2, r3, lsl #1]    ; prefetch ref two rows ahead
    sel     r6, r9, lr          ; keep lanes where ref >= src

    ; usad8 against zero adds up the four bytes of its first operand
    usad8   r4, r7, lr          ; sum of positive differences
    usad8   r5, r6, lr          ; sum of negative differences (magnitude)
    orr     r6, r6, r7          ; |difference| of all 4 pixels (per lane at
                                ; most one of the two operands is non-zero)

    ; sum update; the condition flags are not consumed, so plain add/sub
    ; is used here exactly as in the other three pixel groups
    add     r8, r8, r4          ; sum += positive differences
    sub     r8, r8, r5          ; sum -= negative differences

    ; sse update
    uxtb16  r5, r6              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r11, r5, r5, r11    ; sse += lo*lo + hi*hi            (1)

    ; ---- pixels 4..7 of the row ----
    ldr     r4, [r0, #4]        ; 4 src pixels
    ldr     r5, [r2, #4]        ; 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += lo*lo + hi*hi            (2)

    usub8   r6, r4, r5          ; per-byte src - ref
    sel     r7, r6, lr          ; keep lanes where src >= ref
    usub8   r9, r5, r4          ; per-byte ref - src
    sel     r6, r9, lr          ; keep lanes where ref >= src

    usad8   r4, r7, lr          ; sum of positive differences
    usad8   r5, r6, lr          ; sum of negative differences (magnitude)
    orr     r6, r6, r7          ; |difference| of all 4 pixels

    add     r8, r8, r4          ; sum += positive differences
    sub     r8, r8, r5          ; sum -= negative differences

    uxtb16  r5, r6              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r11, r5, r5, r11    ; sse += lo*lo + hi*hi            (1)

    ; ---- pixels 8..11 of the row ----
    ldr     r4, [r0, #8]        ; 4 src pixels
    ldr     r5, [r2, #8]        ; 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += lo*lo + hi*hi            (2)

    usub8   r6, r4, r5          ; per-byte src - ref
    sel     r7, r6, lr          ; keep lanes where src >= ref
    usub8   r9, r5, r4          ; per-byte ref - src
    sel     r6, r9, lr          ; keep lanes where ref >= src

    usad8   r4, r7, lr          ; sum of positive differences
    usad8   r5, r6, lr          ; sum of negative differences (magnitude)
    orr     r6, r6, r7          ; |difference| of all 4 pixels

    add     r8, r8, r4          ; sum += positive differences
    sub     r8, r8, r5          ; sum -= negative differences

    uxtb16  r5, r6              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r11, r5, r5, r11    ; sse += lo*lo + hi*hi            (1)

    ; ---- pixels 12..15 of the row ----
    ldr     r4, [r0, #12]       ; 4 src pixels
    ldr     r5, [r2, #12]       ; 4 ref pixels
    smlad   r11, r10, r10, r11  ; sse += lo*lo + hi*hi            (2)

    ; the last loads of this row are done, so both pointers can advance
    usub8   r6, r4, r5          ; per-byte src - ref
    add     r0, r0, r1          ; src_ptr -> next row
    sel     r7, r6, lr          ; keep lanes where src >= ref
    usub8   r9, r5, r4          ; per-byte ref - src
    add     r2, r2, r3          ; ref_ptr -> next row
    sel     r6, r9, lr          ; keep lanes where ref >= src

    usad8   r4, r7, lr          ; sum of positive differences
    usad8   r5, r6, lr          ; sum of negative differences (magnitude)
    orr     r6, r6, r7          ; |difference| of all 4 pixels

    add     r8, r8, r4          ; sum += positive differences
    sub     r8, r8, r5          ; sum -= negative differences

    uxtb16  r5, r6              ; bytes 0 and 2 -> two halfwords
    uxtb16  r10, r6, ror #8     ; bytes 1 and 3 -> two halfwords
    smlad   r11, r5, r5, r11    ; sse += lo*lo + hi*hi            (1)
    smlad   r11, r10, r10, r11  ; sse += lo*lo + hi*hi            (2)

    subs    r12, r12, #1        ; one row done; sets Z when none remain
    bne     row_loop

    ; ---- result ----
    ; The fifth argument sits just above the 40 bytes pushed on entry.
    ldr     r6, [sp, #40]       ; r6 = address of sse
    ; |sum| <= 256 * 255 = 65280, so sum * sum < 2^32 and the low 32 bits
    ; produced by mul hold the exact square; the shift below is therefore
    ; a logical (unsigned) one.
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; *sse = sse
    sub     r0, r11, r0, lsr #8 ; return sse - ((sum * sum) >> 8)

    ldmfd   sp!, {r4-r12, pc}   ; restore registers and return

    ENDP

153    END
154
155