/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/


19	.text
20	.syntax unified
21	.align
22
23	.global scanline_t32cb16blend_arm
24
25
/*
 * .macro pixel  DREG, SRC, FB, ODD
 *
 * Blends one 32-bit source pixel over one RGB565 destination pixel.
 *
 * \DREG  32-bit register containing *two* original destination RGB565
 *        pixels, with the even one in the low 16 bits and the odd one
 *        in the high 16 bits.  Only read.
 *
 * \SRC   32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *        Only read.
 *
 * \FB    target register for the blended pixel value.
 *          ODD == 0: FB is overwritten; the result lands in bits 0-15
 *                    and bits 16-31 are left at zero.
 *          ODD != 0: the result is ORed into bits 16-31, so those bits
 *                    must be clear on entry (as they are right after
 *                    an ODD == 0 expansion on the same FB).
 *
 * \ODD   0 to blend the lower 16-bit pixel of DREG, non-zero to blend
 *        the upper one.  Callers should pass 0 or 1.
 *
 * Per channel c (red/blue 5 bits, green 6 bits):
 *
 *      f   = 0x100 - (sA + (sA >> 7))          with sA = SRC >> 24
 *      out = min(src_c + ((dst_c * f) >> 8), max_c)
 *
 * where src_c is the top 5 (or 6) bits of the 8-bit source channel.
 *
 * DREG, SRC and FB must not be r6, r7 or lr.
 *
 * clobbered: r6, r7, lr, condition flags
 *
 */

    // RGB565 destination layout
    .equ    PX_R_SHIFT,     11              // red   = bits 11-15
    .equ    PX_G_SHIFT,     5               // green = bits  5-10, blue = bits 0-4
    .equ    PX_RB_MAX,      0x1F            // 5-bit channel maximum
    .equ    PX_G_MAX,       0x3F            // 6-bit channel maximum
    .equ    PX_HI,          16              // offset of the odd pixel inside DREG/FB

    // 0xAABBGGRR source layout: shift that brings the top 5/6 bits of a
    // channel down to bit 0
    .equ    SRC_R_SHIFT,    3
    .equ    SRC_G_SHIFT,    (8 + 2)
    .equ    SRC_B_SHIFT,    (8 + 8 + 3)
    .equ    SRC_A_SHIFT,    24

.macro pixel,   DREG, SRC, FB, ODD

    // r7 = f, the weight applied to the destination channels
    mov     r7, \SRC, lsr #SRC_A_SHIFT              // sA
    add     r7, r7, r7, lsr #7                      // sA + (sA >> 7), maps 0xFF to 0x100
    rsb     r7, r7, #0x100                          // f = 0x100 - (sA + (sA >> 7))

.if \ODD

    // ---- upper pixel: DREG bits 16-31 -> FB bits 16-31 ----

    // red (top 5 bits of DREG, so the shift alone isolates it)
    mov     lr, \DREG, lsr #(PX_HI + PX_R_SHIFT)
    smulbb  lr, r7, lr                              // dR * f
    mov     r6, \SRC, lsr #SRC_R_SHIFT
    and     r6, r6, #PX_RB_MAX                      // sR
    add     lr, r6, lr, lsr #8                      // sR + ((dR * f) >> 8)
    cmp     lr, #PX_RB_MAX
    orrhs   \FB, \FB, #(PX_RB_MAX << (PX_HI + PX_R_SHIFT))  // saturate
    orrlo   \FB, \FB, lr, lsl #(PX_HI + PX_R_SHIFT)

    // green (kept in place; smulbt multiplies by the top half of r6)
    and     r6, \DREG, #(PX_G_MAX << (PX_HI + PX_G_SHIFT))
    smulbt  r6, r7, r6                              // (dG << 5) * f
    mov     lr, \SRC, lsr #SRC_G_SHIFT
    and     lr, lr, #PX_G_MAX                       // sG
    add     r6, lr, r6, lsr #(PX_G_SHIFT + 8)       // sG + ((dG * f) >> 8)
    cmp     r6, #PX_G_MAX
    orrhs   \FB, \FB, #(PX_G_MAX << (PX_HI + PX_G_SHIFT))   // saturate
    orrlo   \FB, \FB, r6, lsl #(PX_HI + PX_G_SHIFT)

    // blue
    and     lr, \DREG, #(PX_RB_MAX << PX_HI)
    smulbt  lr, r7, lr                              // dB * f
    mov     r6, \SRC, lsr #SRC_B_SHIFT
    and     r6, r6, #PX_RB_MAX                      // sB
    add     lr, r6, lr, lsr #8                      // sB + ((dB * f) >> 8)
    cmp     lr, #PX_RB_MAX
    orrhs   \FB, \FB, #(PX_RB_MAX << PX_HI)         // saturate
    orrlo   \FB, \FB, lr, lsl #PX_HI

.else

    // ---- lower pixel: DREG bits 0-15 -> FB bits 0-15 ----

    // red (this is the first write to FB, hence mov rather than orr)
    mov     lr, \DREG, lsr #PX_R_SHIFT
    and     lr, lr, #PX_RB_MAX                      // dR
    smulbb  lr, r7, lr                              // dR * f
    mov     r6, \SRC, lsr #SRC_R_SHIFT
    and     r6, r6, #PX_RB_MAX                      // sR
    add     lr, r6, lr, lsr #8                      // sR + ((dR * f) >> 8)
    cmp     lr, #PX_RB_MAX
    movhs   \FB, #(PX_RB_MAX << PX_R_SHIFT)         // saturate
    movlo   \FB, lr, lsl #PX_R_SHIFT

    // green (kept in place at bits 5-10 for the multiply)
    and     r6, \DREG, #(PX_G_MAX << PX_G_SHIFT)
    smulbb  r6, r7, r6                              // (dG << 5) * f
    mov     lr, \SRC, lsr #SRC_G_SHIFT
    and     lr, lr, #PX_G_MAX                       // sG
    add     r6, lr, r6, lsr #(PX_G_SHIFT + 8)       // sG + ((dG * f) >> 8)
    cmp     r6, #PX_G_MAX
    orrhs   \FB, \FB, #(PX_G_MAX << PX_G_SHIFT)     // saturate
    orrlo   \FB, \FB, r6, lsl #PX_G_SHIFT

    // blue
    and     lr, \DREG, #PX_RB_MAX                   // dB
    smulbb  lr, r7, lr                              // dB * f
    mov     r6, \SRC, lsr #SRC_B_SHIFT
    and     r6, r6, #PX_RB_MAX                      // sB
    add     lr, r6, lr, lsr #8                      // sB + ((dB * f) >> 8)
    cmp     lr, #PX_RB_MAX
    orrhs   \FB, \FB, #PX_RB_MAX                    // saturate
    orrlo   \FB, \FB, lr

.endif

    .endm


// scanline_t32cb16blend_arm
//
// Blends a run of 32-bit 0xAABBGGRR pre-multiplied source pixels over a
// run of 16-bit RGB565 destination pixels, in place.
//
// r0:  dst ptr   (advanced 2 bytes per pixel)
// r1:  src ptr   (advanced 4 bytes per pixel)
// r2:  count     (pixels remaining, biased by the loop bookkeeping)
// r3:  d         (destination pixel or pixel pair; also temp for the zero test)
// r4:  s0        (source pixel for the even/low destination pixel)
// r5:  s1        (source pixel for the odd/high destination pixel)
// r6:  pixel     (clobbered by the pixel macro)
// r7:  pixel     (clobbered by the pixel macro)
// r8:  free
// r9:  free
// r10: free
// r11: free
// r12: blended output written back to dst
// r14: pixel     (clobbered by the pixel macro, hence lr is saved)

scanline_t32cb16blend_arm:
    stmfd   sp!, {r4-r7, lr}

    pld     [r0]
    pld     [r1]

    // align DST to 32 bits: if it is not word aligned, blend a single
    // pixel first (or return straight away when count is 0)
    tst     r0, #0x3
    beq     aligned
    subs    r2, r2, #1
    ldmfdlo sp!, {r4-r7, lr}        // return
    bxlo    lr

    // blend exactly one pixel with halfword accesses
last:
    ldr     r4, [r1], #4
    ldrh    r3, [r0]
    pixel   r3, r4, r12, 0
    strh    r12, [r0], #2

aligned:
    subs    r2, r2, #2
    blo     9f                      // fewer than 2 pixels left

    // The main loop is unrolled twice and processes 4 pixels
8:  ldmia   r1!, {r4, r5}
    // stream the source
    pld     [r1, #32]
    add     r0, r0, #4
    // both source pixels are all zero, skip this pair
    orrs    r3, r4, r5
    beq     7f

    // load the destination
    ldr     r3, [r0, #-4]
    // stream the destination
    pld     [r0, #32]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    // effectively, we get write-combining by virtue of the
    // cpu write-back cache.
    str     r12, [r0, #-4]

    // 2nd iteration of the loop, do not stream anything
    subs    r2, r2, #2
    blt     9f                      // fewer than 2 pixels left
    ldmia   r1!, {r4, r5}
    add     r0, r0, #4
    orrs    r3, r4, r5
    beq     7f
    ldr     r3, [r0, #-4]
    pixel   r3, r4, r12, 0
    pixel   r3, r5, r12, 1
    str     r12, [r0, #-4]


7:  subs    r2, r2, #2
    bhs     8b                      // at least 2 more pixels

    // r2 is now -1 (one pixel left) or -2 (none left).  The remaining
    // pixel, if any, is fetched from [r1] at last, so no source
    // register needs to be carried over.
9:  adds    r2, r2, #1
    ldmfdlo sp!, {r4-r7, lr}        // return
    bxlo    lr
    b       last
