1/* libs/pixelflinger/t32cb16blend.S
2**
3** Copyright 2006, The Android Open Source Project
4**
5** Licensed under the Apache License, Version 2.0 (the "License");
6** you may not use this file except in compliance with the License.
7** You may obtain a copy of the License at
8**
9**     http://www.apache.org/licenses/LICENSE-2.0
10**
11** Unless required by applicable law or agreed to in writing, software
12** distributed under the License is distributed on an "AS IS" BASIS,
13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14** See the License for the specific language governing permissions and
15** limitations under the License.
16*/
17
18
19	.text
20	.align
21
22	.global scanline_t32cb16blend_arm
23
24
/*
 * .macro pixel
 *
 * Blends one 32-bit source pixel over one RGB565 destination pixel and
 * deposits the saturated result in the matching half of \FB.
 * For every channel:
 *
 *      f   = 0x100 - (sA + (sA >> 7))      (0x100 for sA == 0, 0 for sA == 0xFF)
 *      out = min(channel_max, sC + ((f * dC) >> 8))
 *
 * sC is the source channel reduced to the destination depth by dropping
 * its low bits (5 bits kept for red and blue, 6 bits for green).
 *
 * \DREG is a 32-bit register containing *two* original destination RGB565
 *       pixels: the even one in the low 16 bits and the odd one in the
 *       high 16 bits. It is only read.
 *
 * \SRC  is a 32-bit 0xAABBGGRR pixel value with pre-multiplied colors.
 *       It is only read.
 *
 * \FB   is the register that receives the blended pixel.
 *         \ODD == 0: \FB is overwritten; the result occupies bits 0-15
 *                    and bits 16-31 end up clear.
 *         \ODD != 0: the result is OR-ed into bits 16-31, so those bits
 *                    must be clear on entry (which is the state left by
 *                    an \ODD == 0 expansion on the same \FB).
 *
 * \ODD  selects which pixel of \DREG is blended: 0 for the low half,
 *       any non-zero value (conventionally 1) for the high half.
 *
 * clobbered: r6, r7, lr and the condition flags
 */

    // RGB565 layout: red in bits 11-15, green in bits 5-10, blue in bits 0-4
    .equ    PX_RB_MAX,  0x1F            // largest red / blue value (5 bits)
    .equ    PX_G_MAX,   0x3F            // largest green value (6 bits)
    .equ    PX_R_POS,   11              // bit position of the red field
    .equ    PX_G_POS,   5               // bit position of the green field
    .equ    PX_HI,      16              // bit position of the odd pixel

    // right shifts that bring the kept bits of a 0xAABBGGRR channel to bit 0
    .equ    PX_SRC_R,   3               // top 5 bits of RR
    .equ    PX_SRC_G,   (8 + 2)         // top 6 bits of GG
    .equ    PX_SRC_B,   (8 + 8 + 3)     // top 5 bits of BB

.macro pixel, DREG, SRC, FB, ODD

    // r7 = f = 0x100 - (sA + (sA >> 7)): weight applied to the destination
    mov     r7, \SRC, lsr #24
    add     r7, r7, r7, lsr #7
    rsb     r7, r7, #0x100

.ifeq \ODD

    // ---- even pixel: low half of \DREG -> low half of \FB ----

    // red; written with mov because this is the first write to \FB
    mov     lr, \DREG, lsr #PX_R_POS
    and     lr, lr, #PX_RB_MAX
    smulbb  lr, r7, lr                          // f * dR
    mov     r6, \SRC, lsr #PX_SRC_R
    and     r6, r6, #PX_RB_MAX                  // sR
    add     lr, r6, lr, lsr #8                  // sR + ((f * dR) >> 8)
    cmp     lr, #PX_RB_MAX
    movhs   \FB, #(PX_RB_MAX << PX_R_POS)       // saturate
    movlo   \FB, lr, lsl #PX_R_POS

    // green; multiplied without shifting it down first, so the product
    // carries an extra factor of (1 << PX_G_POS) that the add removes
    and     r6, \DREG, #(PX_G_MAX << PX_G_POS)
    smulbb  r6, r7, r6                          // f * (dG << 5)
    mov     lr, \SRC, lsr #PX_SRC_G
    and     lr, lr, #PX_G_MAX                   // sG
    add     r6, lr, r6, lsr #(PX_G_POS + 8)     // sG + ((f * dG) >> 8)
    cmp     r6, #PX_G_MAX
    orrhs   \FB, \FB, #(PX_G_MAX << PX_G_POS)   // saturate
    orrlo   \FB, \FB, r6, lsl #PX_G_POS

    // blue
    and     lr, \DREG, #PX_RB_MAX
    smulbb  lr, r7, lr                          // f * dB
    mov     r6, \SRC, lsr #PX_SRC_B
    and     r6, r6, #PX_RB_MAX                  // sB
    add     lr, r6, lr, lsr #8                  // sB + ((f * dB) >> 8)
    cmp     lr, #PX_RB_MAX
    orrhs   \FB, \FB, #PX_RB_MAX                // saturate
    orrlo   \FB, \FB, lr

.else

    // ---- odd pixel: high half of \DREG -> high half of \FB ----

    // red; it sits in the top 5 bits, so the shift alone isolates it
    mov     lr, \DREG, lsr #(PX_HI + PX_R_POS)
    smulbb  lr, r7, lr                          // f * dR
    mov     r6, \SRC, lsr #PX_SRC_R
    and     r6, r6, #PX_RB_MAX                  // sR
    add     lr, r6, lr, lsr #8                  // sR + ((f * dR) >> 8)
    cmp     lr, #PX_RB_MAX
    orrhs   \FB, \FB, #(PX_RB_MAX << (PX_HI + PX_R_POS))
    orrlo   \FB, \FB, lr, lsl #(PX_HI + PX_R_POS)

    // green; smulbt takes the top halfword of r6, which is (dG << 5)
    and     r6, \DREG, #(PX_G_MAX << (PX_HI + PX_G_POS))
    smulbt  r6, r7, r6                          // f * (dG << 5)
    mov     lr, \SRC, lsr #PX_SRC_G
    and     lr, lr, #PX_G_MAX                   // sG
    add     r6, lr, r6, lsr #(PX_G_POS + 8)     // sG + ((f * dG) >> 8)
    cmp     r6, #PX_G_MAX
    orrhs   \FB, \FB, #(PX_G_MAX << (PX_HI + PX_G_POS))
    orrlo   \FB, \FB, r6, lsl #(PX_HI + PX_G_POS)

    // blue; smulbt takes the top halfword of lr, which is dB
    and     lr, \DREG, #(PX_RB_MAX << PX_HI)
    smulbt  lr, r7, lr                          // f * dB
    mov     r6, \SRC, lsr #PX_SRC_B
    and     r6, r6, #PX_RB_MAX                  // sB
    add     lr, r6, lr, lsr #8                  // sB + ((f * dB) >> 8)
    cmp     lr, #PX_RB_MAX
    orrhs   \FB, \FB, #(PX_RB_MAX << PX_HI)
    orrlo   \FB, \FB, lr, lsl #PX_HI

.endif

    .endm
122
123
// scanline_t32cb16blend_arm
//
// Blends a run of 32-bit source pixels over a run of RGB565 destination
// pixels, in place, using the pixel macro for the per-pixel math.
//
// NOTE(review): presumably called from C as
//   void scanline_t32cb16blend_arm(uint16_t* dst, uint32_t* src, size_t count)
// -- confirm against the caller.
//
// In:
//   r0:  dst ptr   (RGB565 pixels, read and written)
//   r1:  src ptr   (32-bit pixels, read with word loads)
//   r2:  count     (number of pixels, handled as an unsigned value)
//
// Register use:
//   r3:  d         (destination pixel, or pair of pixels)
//   r4:  s0        (source pixel for the even / single position)
//   r5:  s1        (source pixel for the odd position)
//   r6:  pixel     (macro temporary)
//   r7:  pixel     (macro temporary)
//   r8:  free
//   r9:  free
//   r10: free
//   r11: free
//   r12: blended result waiting to be stored
//   r14: pixel     (macro temporary)
//
// r4-r7 and lr are saved on entry and restored before returning.

scanline_t32cb16blend_arm:
    stmfd   sp!, {r4-r7, lr}

    pld     [r0]
    pld     [r1]

    // align DST to 32 bits: if it is not word aligned, blend a single
    // pixel first so that the main loop can use word accesses
    tst     r0, #0x3
    beq     aligned
    subs    r2, r2, #1                  // r2 = pixels left after this one
    ldmlofd sp!, {r4-r7, lr}            // count was 0: return
    bxlo    lr

    // blend exactly one pixel with halfword accesses
last:
    ldr     r4, [r1], #4
    ldrh    r3, [r0]
    pixel   r3, r4, r12, 0
    strh    r12, [r0], #2

aligned:
    // from here on r2 holds (pixels left - 2); a borrow means that fewer
    // than two pixels remain
    subs    r2, r2, #2
    blo     9f

    // The main loop is unrolled twice and processes 4 pixels
8:  ldmia   r1!, {r4, r5}
    // stream the source
    pld     [r1, #32]
    add     r0, r0, #4
    // both source pixels are all zero: the destination pair is unchanged,
    // skip it
    orrs    r3, r4, r5
    beq     7f

    // load the destination
    ldr     r3, [r0, #-4]
    // stream the destination
    pld     [r0, #32]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    // effectively, we're getting write-combining by virtue of the
    // cpu's write-back cache.
    str     r12, [r0, #-4]

    // 2nd iteration of the loop, don't stream anything
    subs    r2, r2, #2
    blo     9f                          // unsigned, like every other count test
    ldmia   r1!, {r4, r5}
    add     r0, r0, #4
    orrs    r3, r4, r5
    beq     7f
    ldr     r3, [r0, #-4]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    str     r12, [r0, #-4]

7:  subs    r2, r2, #2
    bhs     8b                          // at least two more pixels: loop

    // r2 is now -2 (nothing left) or -1 (one pixel left); adding 1 carries
    // only in the second case. r1 already points at the leftover pixel,
    // which "last" loads itself.
9:  adds    r2, r2, #1
    ldmlofd sp!, {r4-r7, lr}            // return
    bxlo    lr
    b       last

    .size   scanline_t32cb16blend_arm, . - scanline_t32cb16blend_arm
203