1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *  * Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 *  * Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in
12 *    the documentation and/or other materials provided with the
13 *    distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28    .text
29    .balign 0
30
31    .global scanline_t32cb16blend_arm64
32
/*
 * .macro pixel
 *
 *  Alpha-blends one RGB565 destination pixel, held in either the low or
 *  the high 16 bits of DREG, with the 32-bit source pixel SRC, and places
 *  the result in the matching half of FB.
 *
 *  For each channel c (5, 6, 5 bits wide for red, green, blue):
 *
 *      f   = 0x100 - (sA + (sA >> 7))          // destination weight, 0..0x100
 *      out = min(src_c + ((dst_c * f) >> 8), max_c)
 *
 *  where src_c is the top 5 or 6 bits of the 8-bit source channel.
 *
 * \DREG is a 32-bit register containing *two* original destination RGB565
 *       pixels, with the even one in the low 16 bits, and the odd one in
 *       the high 16 bits. It is only read.
 *
 * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *      It is only read.
 *
 * \FB is the target register that receives the blended pixel.
 *     With ODD == 0 the whole register is overwritten: the result lands in
 *     bits 15:0 and bits 31:16 become zero.
 *     With ODD == 1 the result is ORed into bits 31:16, so those bits must
 *     be zero on entry (expand the ODD == 0 variant on the same FB first).
 *
 * \ODD is either 0 or 1 and indicates if we're blending the lower (0) or
 *      upper (1) 16-bit pixel of DREG into FB.
 *
 * clobbered: w6, w7, w16, w17 and the NZCV flags.
 *            DREG, SRC and FB must not be any of these registers.
 *
 * x18 is deliberately not used as a temporary: it is the platform register
 * and is reserved on Android.
 */

.macro pixel,   DREG, SRC, FB, ODD

    // w7 = destination weight f (SRC = 0xAABBGGRR, alpha in bits 31:24)
    lsr     w7, \SRC, #24               // sA
    add     w7, w7, w7, lsr #7          // sA + (sA >> 7): maps 0..255 onto 0..256
    mov     w6, #0x100
    sub     w7, w6, w7                  // f = 0x100 - (sA + (sA >> 7))

.if \ODD // Blending odd pixel present in top 16 bits of DREG register

    // red: DREG bits 31:27, SRC bits 7:3
    lsr     w16, \DREG, #(16 + 11)      // dR (top field, no mask needed)
    mul     w16, w7, w16                // dR * f
    ubfx    w6, \SRC, #3, #5            // sR
    add     w16, w6, w16, lsr #8        // sR + ((dR * f) >> 8)
    mov     w17, #0x1F
    cmp     w16, w17
    csel    w16, w17, w16, hi           // saturate to 5 bits
    orr     \FB, \FB, w16, lsl #(16 + 11)

    // green: DREG bits 26:21, SRC bits 15:10
    ubfx    w16, \DREG, #(16 + 5), #6   // dG
    mul     w16, w7, w16                // dG * f
    ubfx    w6, \SRC, #(8 + 2), #6      // sG
    add     w16, w6, w16, lsr #8        // sG + ((dG * f) >> 8)
    mov     w17, #0x3F
    cmp     w16, w17
    csel    w16, w17, w16, hi           // saturate to 6 bits
    orr     \FB, \FB, w16, lsl #(16 + 5)

    // blue: DREG bits 20:16, SRC bits 23:19
    ubfx    w16, \DREG, #16, #5         // dB
    mul     w16, w7, w16                // dB * f
    ubfx    w6, \SRC, #(8 + 8 + 3), #5  // sB
    add     w16, w6, w16, lsr #8        // sB + ((dB * f) >> 8)
    mov     w17, #0x1F
    cmp     w16, w17
    csel    w16, w17, w16, hi           // saturate to 5 bits
    orr     \FB, \FB, w16, lsl #16

.else // Blending even pixel present in bottom 16 bits of DREG register

    // red: DREG bits 15:11, SRC bits 7:3
    ubfx    w16, \DREG, #11, #5         // dR
    mul     w16, w7, w16                // dR * f
    ubfx    w6, \SRC, #3, #5            // sR
    add     w16, w6, w16, lsr #8        // sR + ((dR * f) >> 8)
    mov     w17, #0x1F
    cmp     w16, w17
    csel    w16, w17, w16, hi           // saturate to 5 bits
    lsl     \FB, w16, #11               // first write: initialises all of FB

    // green: DREG bits 10:5, SRC bits 15:10
    ubfx    w16, \DREG, #5, #6          // dG
    mul     w16, w7, w16                // dG * f
    ubfx    w6, \SRC, #(8 + 2), #6      // sG
    add     w16, w6, w16, lsr #8        // sG + ((dG * f) >> 8)
    mov     w17, #0x3F
    cmp     w16, w17
    csel    w16, w17, w16, hi           // saturate to 6 bits
    orr     \FB, \FB, w16, lsl #5

    // blue: DREG bits 4:0, SRC bits 23:19
    and     w16, \DREG, #0x1F           // dB
    mul     w16, w7, w16                // dB * f
    ubfx    w6, \SRC, #(8 + 8 + 3), #5  // sB
    add     w16, w6, w16, lsr #8        // sB + ((dB * f) >> 8)
    mov     w17, #0x1F
    cmp     w16, w17
    csel    w16, w17, w16, hi           // saturate to 5 bits
    orr     \FB, \FB, w16

.endif // End of blending even pixel

.endm // End of pixel macro
141
142
// scanline_t32cb16blend_arm64
//
// Blends a run of 32-bit 0xAABBGGRR source pixels over RGB565 destination
// pixels in place, using the pixel macro for each pixel.
//
// In:
//   x0:  dst ptr (16-bit pixels), advanced as pixels are written
//   x1:  src ptr (32-bit pixels), advanced as pixels are read
//   w2:  count of pixels to blend
//
// Working registers:
//   w3:  destination pixel(s) loaded from dst; also temp for the zero test
//   w4:  s0, source pixel for the even (low) destination half
//   w5:  s1, source pixel for the odd (high) destination half
//   w12: blended result to be stored
//   plus the temporaries and flags listed in the pixel macro's clobber list
//
// Out: nothing. The function touches no stack and calls nothing.

    .type   scanline_t32cb16blend_arm64, %function
scanline_t32cb16blend_arm64:

    // If DST is not 32-bit aligned, blend one pixel on its own first so
    // that the main loop can use 32-bit loads/stores on the destination.
    tst     x0, #0x3
    b.eq    .Lt32cb16_aligned
    subs    w2, w2, #1
    b.lo    .Lt32cb16_return            // count was 0: nothing to do

    // Blend exactly one pixel (leading unaligned pixel or trailing odd one).
.Lt32cb16_single:
    ldr     w4, [x1], #4
    ldrh    w3, [x0]
    pixel   w3, w4, w12, 0
    strh    w12, [x0], #2

.Lt32cb16_aligned:
    // From here on w2 is kept biased by -2: a borrow means that fewer than
    // two pixels remain.
    subs    w2, w2, #2
    b.lo    .Lt32cb16_tail

    // The main loop is unrolled twice and processes 4 pixels
.Lt32cb16_pair_loop:
    ldp     w4, w5, [x1], #8
    add     x0, x0, #4
    // both source pixels are all zero: destination is unchanged, skip
    orr     w3, w4, w5
    cbz     w3, .Lt32cb16_next

    ldr     w3, [x0, #-4]               // two destination pixels
    pixel   w3, w4, w12, 0              // even first: initialises w12
    pixel   w3, w5, w12, 1
    str     w12, [x0, #-4]

    // second half of the unrolled loop
    subs    w2, w2, #2
    b.lo    .Lt32cb16_tail              // unsigned, like every other count test
    ldp     w4, w5, [x1], #8
    add     x0, x0, #4
    orr     w3, w4, w5
    cbz     w3, .Lt32cb16_next
    ldr     w3, [x0, #-4]
    pixel   w3, w4, w12, 0
    pixel   w3, w5, w12, 1
    str     w12, [x0, #-4]

.Lt32cb16_next:
    subs    w2, w2, #2
    b.hs    .Lt32cb16_pair_loop

    // w2 is -1 (one pixel left) or -2 (none left) here.
.Lt32cb16_tail:
    adds    w2, w2, #1                  // carry set only when one pixel is left
    b.lo    .Lt32cb16_return
    b       .Lt32cb16_single            // reloads w4 from [x1] itself

.Lt32cb16_return:
    ret
    .size   scanline_t32cb16blend_arm64, . - scanline_t32cb16blend_arm64
214