1/*
2 * Copyright (C) 2013-2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
18#define END(f) .size f, .-f;
19
20#define BLEND_LIST(X) \
21    X(0, CLEAR) \
22    X(1, SRC) \
23    X(2, DST) \
24    X(3, SRC_OVER) \
25    X(4, DST_OVER) \
26    X(5, SRC_IN) \
27    X(6, DST_IN) \
28    X(7, SRC_OUT) \
29    X(8, DST_OUT) \
30    X(9, SRC_ATOP) \
31    X(10, DST_ATOP) \
32    X(11, XOR) \
33    X(14, MULTIPLY) \
34    X(21, DIFFERENCE) \
35    X(34, ADD) \
36    X(35, SUBTRACT)
37
38/* For every blend operation supported, define a macro with just the arithmetic
39 * component.  The rest can be handled later on.
40 *
41 * At entry q0-q3 contain the RGBA data from the destination buffer, and q8-q11
42 * contain the data from the source buffer.  Both have already been split out
43 * into one colour component per register (if necessary).  q3 and q11 contain
44 * the alpha components.
45 *
46 * At the same time as defining the assembly macro, define a corresponding
47 * preprocessor macro indicating any other requirements.
48 *    zipped=0 -- The macro does not require the RGBA components to be
49 *                separated.
50 *    lddst=0  -- The macro does not require data from the destination buffer.
51 *    ldsrc=0  -- The macro does not require data from the source buffer.
52 *    nowrap=1 -- The macro requires no wrapper at all, and should simply be
53 *                inserted without any surrounding load/store or loop code.
54 */
55
56#define params_CLEAR zipped=0, lddst=0, ldsrc=0
57.macro blend_kernel_CLEAR
58        movi    v0.16b, #0
59        movi    v1.16b, #0
60        movi    v2.16b, #0
61        movi    v3.16b, #0
62.endm
63
64#define params_SRC zipped=0, lddst=0
65.macro blend_kernel_SRC
66        mov     v0.16b, v8.16b
67        mov     v1.16b, v9.16b
68        mov     v2.16b, v10.16b
69        mov     v3.16b, v11.16b
70.endm
71
72#define params_DST nowrap=1
73.macro blend_kernel_DST
74        /* nop */
75.endm
76
77#define params_SRC_OVER zipped=1
78.macro blend_kernel_SRC_OVER
79        mvn         v7.16b, v11.16b
80
81        umull2      v12.8h, v7.16b, v0.16b
82        umull       v0.8h,  v7.8b,  v0.8b
83        umull2      v13.8h, v7.16b, v1.16b
84        umull       v1.8h,  v7.8b,  v1.8b
85        umull2      v14.8h, v7.16b, v2.16b
86        umull       v2.8h,  v7.8b,  v2.8b
87        umull2      v15.8h, v7.16b, v3.16b
88        umull       v3.8h,  v7.8b,  v3.8b
89
90        rshrn       v4.8b,  v0.8h,  #8
91        rshrn2      v4.16b, v12.8h, #8
92        rshrn       v5.8b,  v1.8h,  #8
93        rshrn2      v5.16b, v13.8h, #8
94        rshrn       v6.8b,  v2.8h,  #8
95        rshrn2      v6.16b, v14.8h, #8
96        rshrn       v7.8b,  v3.8h,  #8
97        rshrn2      v7.16b, v15.8h, #8
98
99        uaddw       v0.8h,  v0.8h,  v4.8b
100        uaddw2      v12.8h, v12.8h, v4.16b
101        uaddw       v1.8h,  v1.8h,  v5.8b
102        uaddw2      v13.8h, v13.8h, v5.16b
103        uaddw       v2.8h,  v2.8h,  v6.8b
104        uaddw2      v14.8h, v14.8h, v6.16b
105        uaddw       v3.8h,  v3.8h,  v7.8b
106        uaddw2      v15.8h, v15.8h, v7.16b
107
108        rshrn       v0.8b,  v0.8h,  #8
109        rshrn2      v0.16b, v12.8h, #8
110        rshrn       v1.8b,  v1.8h,  #8
111        rshrn2      v1.16b, v13.8h, #8
112        rshrn       v2.8b,  v2.8h,  #8
113        rshrn2      v2.16b, v14.8h, #8
114        rshrn       v3.8b,  v3.8h,  #8
115        rshrn2      v3.16b, v15.8h, #8
116
117        uqadd       v0.16b, v0.16b, v8.16b
118        uqadd       v1.16b, v1.16b, v9.16b
119        uqadd       v2.16b, v2.16b, v10.16b
120        uqadd       v3.16b, v3.16b, v11.16b
121.endm
122
123#define params_DST_OVER zipped=1
124.macro blend_kernel_DST_OVER
125        mvn         v7.16b, v3.16b
126
127        umull2      v12.8h, v7.16b, v8.16b
128        umull       v8.8h,  v7.8b,  v8.8b
129        umull2      v13.8h, v7.16b, v9.16b
130        umull       v9.8h,  v7.8b,  v9.8b
131        umull2      v14.8h, v7.16b, v10.16b
132        umull       v10.8h, v7.8b,  v10.8b
133        umull2      v15.8h, v7.16b, v11.16b
134        umull       v11.8h, v7.8b,  v11.8b
135
136        rshrn       v4.8b,  v8.8h,  #8
137        rshrn2      v4.16b, v12.8h, #8
138        rshrn       v5.8b,  v9.8h,  #8
139        rshrn2      v5.16b, v13.8h, #8
140        rshrn       v6.8b,  v10.8h, #8
141        rshrn2      v6.16b, v14.8h, #8
142        rshrn       v7.8b,  v11.8h, #8
143        rshrn2      v7.16b, v15.8h, #8
144
145        uaddw       v8.8h,  v8.8h,  v4.8b
146        uaddw2      v12.8h, v12.8h, v4.16b
147        uaddw       v9.8h,  v9.8h,  v5.8b
148        uaddw2      v13.8h, v13.8h, v5.16b
149        uaddw       v10.8h, v10.8h, v6.8b
150        uaddw2      v14.8h, v14.8h, v6.16b
151        uaddw       v11.8h, v11.8h, v7.8b
152        uaddw2      v15.8h, v15.8h, v7.16b
153
154        rshrn       v8.8b,  v8.8h,  #8
155        rshrn2      v8.16b, v12.8h, #8
156        rshrn       v9.8b,  v9.8h,  #8
157        rshrn2      v9.16b, v13.8h, #8
158        rshrn       v10.8b,  v10.8h, #8
159        rshrn2      v10.16b, v14.8h, #8
160        rshrn       v11.8b,  v11.8h, #8
161        rshrn2      v11.16b, v15.8h, #8
162
163        uqadd       v0.16b, v0.16b, v8.16b
164        uqadd       v1.16b, v1.16b, v9.16b
165        uqadd       v2.16b, v2.16b, v10.16b
166        uqadd       v3.16b, v3.16b, v11.16b
167.endm
168
169#define params_SRC_IN zipped=1
170.macro blend_kernel_SRC_IN
171        umull2      v12.8h, v3.16b, v8.16b
172        umull       v0.8h,  v3.8b,  v8.8b
173        umull2      v13.8h, v3.16b, v9.16b
174        umull       v1.8h,  v3.8b,  v9.8b
175        umull2      v14.8h, v3.16b, v10.16b
176        umull       v2.8h,  v3.8b,  v10.8b
177        umull2      v15.8h, v3.16b, v11.16b
178        umull       v3.8h,  v3.8b,  v11.8b
179
180        rshrn       v4.8b,  v0.8h,  #8
181        rshrn2      v4.16b, v12.8h, #8
182        rshrn       v5.8b,  v1.8h,  #8
183        rshrn2      v5.16b, v13.8h, #8
184        rshrn       v6.8b,  v2.8h,  #8
185        rshrn2      v6.16b, v14.8h, #8
186        rshrn       v7.8b,  v3.8h,  #8
187        rshrn2      v7.16b, v15.8h, #8
188
189        uaddw       v0.8h,  v0.8h,  v4.8b
190        uaddw2      v12.8h, v12.8h, v4.16b
191        uaddw       v1.8h,  v1.8h,  v5.8b
192        uaddw2      v13.8h, v13.8h, v5.16b
193        uaddw       v2.8h,  v2.8h,  v6.8b
194        uaddw2      v14.8h, v14.8h, v6.16b
195        uaddw       v3.8h,  v3.8h,  v7.8b
196        uaddw2      v15.8h, v15.8h, v7.16b
197
198        rshrn       v0.8b,  v0.8h,  #8
199        rshrn2      v0.16b, v12.8h, #8
200        rshrn       v1.8b,  v1.8h,  #8
201        rshrn2      v1.16b, v13.8h, #8
202        rshrn       v2.8b,  v2.8h,  #8
203        rshrn2      v2.16b, v14.8h, #8
204        rshrn       v3.8b,  v3.8h,  #8
205        rshrn2      v3.16b, v15.8h, #8
206.endm
207
208#define params_DST_IN zipped=1
209.macro blend_kernel_DST_IN
210        umull2      v12.8h, v0.16b, v11.16b
211        umull       v0.8h,  v0.8b,  v11.8b
212        umull2      v13.8h, v1.16b, v11.16b
213        umull       v1.8h,  v1.8b,  v11.8b
214        umull2      v14.8h, v2.16b, v11.16b
215        umull       v2.8h,  v2.8b,  v11.8b
216        umull2      v15.8h, v3.16b, v11.16b
217        umull       v3.8h,  v3.8b,  v11.8b
218
219        rshrn       v4.8b,  v0.8h,  #8
220        rshrn2      v4.16b, v12.8h, #8
221        rshrn       v5.8b,  v1.8h,  #8
222        rshrn2      v5.16b, v13.8h, #8
223        rshrn       v6.8b,  v2.8h,  #8
224        rshrn2      v6.16b, v14.8h, #8
225        rshrn       v7.8b,  v3.8h,  #8
226        rshrn2      v7.16b, v15.8h, #8
227
228        uaddw       v0.8h,  v0.8h,  v4.8b
229        uaddw2      v12.8h, v12.8h, v4.16b
230        uaddw       v1.8h,  v1.8h,  v5.8b
231        uaddw2      v13.8h, v13.8h, v5.16b
232        uaddw       v2.8h,  v2.8h,  v6.8b
233        uaddw2      v14.8h, v14.8h, v6.16b
234        uaddw       v3.8h,  v3.8h,  v7.8b
235        uaddw2      v15.8h, v15.8h, v7.16b
236
237        rshrn       v0.8b,  v0.8h,  #8
238        rshrn2      v0.16b, v12.8h, #8
239        rshrn       v1.8b,  v1.8h,  #8
240        rshrn2      v1.16b, v13.8h, #8
241        rshrn       v2.8b,  v2.8h,  #8
242        rshrn2      v2.16b, v14.8h, #8
243        rshrn       v3.8b,  v3.8h,  #8
244        rshrn2      v3.16b, v15.8h, #8
245.endm
246
247#define params_SRC_OUT zipped=1
248.macro blend_kernel_SRC_OUT
249        mvn         v3.16b, v3.16b
250        blend_kernel_SRC_IN
251.endm
252
253
254#define params_DST_OUT zipped=1
255.macro blend_kernel_DST_OUT
256        mvn         v11.16b, v11.16b
257        blend_kernel_DST_IN
258.endm
259
260#define params_SRC_ATOP zipped=1
261.macro blend_kernel_SRC_ATOP
262        mvn         v11.16b, v11.16b
263
264        umull2      v12.8h, v11.16b, v0.16b
265        umull       v0.8h,  v11.8b,  v0.8b
266        umull2      v13.8h, v11.16b, v1.16b
267        umull       v1.8h,  v11.8b,  v1.8b
268        umull2      v14.8h, v11.16b, v2.16b
269        umull       v2.8h,  v11.8b,  v2.8b
270
271        umull2      v4.8h,  v3.16b, v8.16b
272        umull       v8.8h,  v3.8b,  v8.8b
273        umull2      v5.8h,  v3.16b, v9.16b
274        umull       v9.8h,  v3.8b,  v9.8b
275        umull2      v6.8h,  v3.16b, v10.16b
276        umull       v10.8h, v3.8b,  v10.8b
277
278        uqadd       v12.8h, v12.8h, v4.8h
279        uqadd       v0.8h,  v0.8h,  v8.8h
280        uqadd       v13.8h, v13.8h, v5.8h
281        uqadd       v1.8h,  v1.8h,  v9.8h
282        uqadd       v14.8h, v14.8h, v6.8h
283        uqadd       v2.8h,  v2.8h,  v10.8h
284
285        urshr       v8.8h,  v0.8h,  #8
286        urshr       v4.8h,  v12.8h, #8
287        urshr       v9.8h,  v1.8h,  #8
288        urshr       v5.8h,  v13.8h, #8
289        urshr       v10.8h, v2.8h,  #8
290        urshr       v6.8h,  v14.8h, #8
291
292        uqadd       v0.8h,  v0.8h,  v8.8h
293        uqadd       v12.8h, v12.8h, v4.8h
294        uqadd       v1.8h,  v1.8h,  v9.8h
295        uqadd       v13.8h, v13.8h, v5.8h
296        uqadd       v2.8h,  v2.8h,  v10.8h
297        uqadd       v14.8h, v14.8h, v6.8h
298
299        uqrshrn     v0.8b,  v0.8h,  #8
300        uqrshrn2    v0.16b, v12.8h, #8
301        uqrshrn     v1.8b,  v1.8h,  #8
302        uqrshrn2    v1.16b, v13.8h, #8
303        uqrshrn     v2.8b,  v2.8h,  #8
304        uqrshrn2    v2.16b, v14.8h, #8
305.endm
306
307#define params_DST_ATOP zipped=1
308.macro blend_kernel_DST_ATOP
309        mvn         v3.16b, v3.16b
310
311        umull2      v12.8h, v11.16b, v0.16b
312        umull       v0.8h,  v11.8b,  v0.8b
313        umull2      v13.8h, v11.16b, v1.16b
314        umull       v1.8h,  v11.8b,  v1.8b
315        umull2      v14.8h, v11.16b, v2.16b
316        umull       v2.8h,  v11.8b,  v2.8b
317
318        umull2      v4.8h,  v3.16b, v8.16b
319        umull       v8.8h,  v3.8b,  v8.8b
320        umull2      v5.8h,  v3.16b, v9.16b
321        umull       v9.8h,  v3.8b,  v9.8b
322        umull2      v6.8h,  v3.16b, v10.16b
323        umull       v10.8h, v3.8b,  v10.8b
324
325        uqadd       v12.8h, v12.8h, v4.8h
326        uqadd       v0.8h,  v0.8h,  v8.8h
327        uqadd       v13.8h, v13.8h, v5.8h
328        uqadd       v1.8h,  v1.8h,  v9.8h
329        uqadd       v14.8h, v14.8h, v6.8h
330        uqadd       v2.8h,  v2.8h,  v10.8h
331
332        urshr       v8.8h,  v0.8h,  #8
333        urshr       v4.8h,  v12.8h, #8
334        urshr       v9.8h,  v1.8h,  #8
335        urshr       v5.8h,  v13.8h, #8
336        urshr       v10.8h, v2.8h,  #8
337        urshr       v6.8h,  v14.8h, #8
338
339        uqadd       v0.8h,  v0.8h,  v8.8h
340        uqadd       v12.8h, v12.8h, v4.8h
341        uqadd       v1.8h,  v1.8h,  v9.8h
342        uqadd       v13.8h, v13.8h, v5.8h
343        uqadd       v2.8h,  v2.8h,  v10.8h
344        uqadd       v14.8h, v14.8h, v6.8h
345
346        uqrshrn     v0.8b,  v0.8h,  #8
347        uqrshrn2    v0.16b, v12.8h, #8
348        uqrshrn     v1.8b,  v1.8h,  #8
349        uqrshrn2    v1.16b, v13.8h, #8
350        uqrshrn     v2.8b,  v2.8h,  #8
351        uqrshrn2    v2.16b, v14.8h, #8
352
353        mov         v3.16b, v11.16b
354.endm
355
356#define params_MULTIPLY zipped=0
357.macro blend_kernel_MULTIPLY
358        umull2      v12.8h, v0.16b, v8.16b
359        umull       v0.8h,  v0.8b,  v8.8b
360        umull2      v13.8h, v1.16b, v9.16b
361        umull       v1.8h,  v1.8b,  v9.8b
362        umull2      v14.8h, v2.16b, v10.16b
363        umull       v2.8h,  v2.8b,  v10.8b
364        umull2      v15.8h, v3.16b, v11.16b
365        umull       v3.8h,  v3.8b,  v11.8b
366
367        rshrn       v4.8b,  v0.8h,  #8
368        rshrn2      v4.16b, v12.8h, #8
369        rshrn       v5.8b,  v1.8h,  #8
370        rshrn2      v5.16b, v13.8h, #8
371        rshrn       v6.8b,  v2.8h,  #8
372        rshrn2      v6.16b, v14.8h, #8
373        rshrn       v7.8b,  v3.8h,  #8
374        rshrn2      v7.16b, v15.8h, #8
375
376        uaddw       v0.8h,  v0.8h,  v4.8b
377        uaddw2      v12.8h, v12.8h, v4.16b
378        uaddw       v1.8h,  v1.8h,  v5.8b
379        uaddw2      v13.8h, v13.8h, v5.16b
380        uaddw       v2.8h,  v2.8h,  v6.8b
381        uaddw2      v14.8h, v14.8h, v6.16b
382        uaddw       v3.8h,  v3.8h,  v7.8b
383        uaddw2      v15.8h, v15.8h, v7.16b
384
385        rshrn       v0.8b,  v0.8h,  #8
386        rshrn2      v0.16b, v12.8h, #8
387        rshrn       v1.8b,  v1.8h,  #8
388        rshrn2      v1.16b, v13.8h, #8
389        rshrn       v2.8b,  v2.8h,  #8
390        rshrn2      v2.16b, v14.8h, #8
391        rshrn       v3.8b,  v3.8h,  #8
392        rshrn2      v3.16b, v15.8h, #8
393.endm
394
395#define params_ADD zipped=0
396.macro blend_kernel_ADD
397        uqadd    v0.16b, v0.16b, v8.16b
398        uqadd    v1.16b, v1.16b, v9.16b
399        uqadd    v2.16b, v2.16b, v10.16b
400        uqadd    v3.16b, v3.16b, v11.16b
401.endm
402
403#define params_SUBTRACT zipped=0
404.macro blend_kernel_SUBTRACT
405        uqsub    v0.16b, v0.16b, v8.16b
406        uqsub    v1.16b, v1.16b, v9.16b
407        uqsub    v2.16b, v2.16b, v10.16b
408        uqsub    v3.16b, v3.16b, v11.16b
409.endm
410
411#define params_DIFFERENCE zipped=0
412.macro blend_kernel_DIFFERENCE
413        uabd    v0.16b, v0.16b, v8.16b
414        uabd    v1.16b, v1.16b, v9.16b
415        uabd    v2.16b, v2.16b, v10.16b
416        uabd    v3.16b, v3.16b, v11.16b
417.endm
418
419#define params_XOR zipped=0
420.macro blend_kernel_XOR
421        eor     v0.16b, v0.16b, v8.16b
422        eor     v1.16b, v1.16b, v9.16b
423        eor     v2.16b, v2.16b, v10.16b
424        eor     v3.16b, v3.16b, v11.16b
425.endm
426
427
428/* Define the wrapper code which will load and store the data, iterate the
429 * correct number of times, and safely handle the remainder at the end of the
430 * loop.  Various sections of assembly code are dropped or substituted for
431 * simpler operations if they're not needed.
432 */
433.macro wrap_line kernel, nowrap=0, zipped=1, lddst=1, ldsrc=1, pld=1
434.if \nowrap
435        \kernel
436.else
437        sub     x3, sp, #32
438        sub     sp, sp, #64
439        st1     {v8.1d - v11.1d}, [sp]
440        st1     {v12.1d - v15.1d}, [x3]
441        subs    x2, x2, #64
442        b       2f
443.align 4
4441:
445  .if \lddst
446    .if \zipped
447        ld4     {v0.16b - v3.16b}, [x0]
448    .else
449        ld1     {v0.16b - v3.16b}, [x0]
450    .endif
451  .endif
452  .if \ldsrc
453    .if \zipped
454        ld4     {v8.16b - v11.16b}, [x1], #64
455    .else
456        ld1     {v8.16b - v11.16b}, [x1], #64
457    .endif
458  .endif
459  .if \pld
460#if 0 /* TODO: test this on real hardware */
461    .if \lddst ; prfm PLDL1STRM, [x0, #192] ; .endif
462    .if \ldsrc ; prfm PLDL1STRM, [x1, #192] ; .endif
463#endif
464  .endif
465
466        \kernel
467
468        subs    x2, x2, #64
469  .if \zipped
470        st4     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
471  .else
472        st1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
473  .endif
474
4752:      bge     1b
476        adds    x2, x2, #64
477        beq     2f
478
479        /* To handle the tail portion of the data (something less than 64
480         * bytes) load small power-of-two chunks into working registers.  It
481         * doesn't matter where they end up in the register; the same process
482         * will store them back out using the same positions and the operations
483         * don't require data to interact with its neighbours.
484         */
485        movi    v0.16b, #0
486        movi    v1.16b, #0
487        movi    v2.16b, #0
488        movi    v3.16b, #0
489
490        movi    v8.16b, #0
491        movi    v9.16b, #0
492        movi    v10.16b, #0
493        movi    v11.16b, #0
494
495        tbz     x2, #5, 1f
496  .if \lddst ; ld1     {v2.16b,v3.16b}, [x0], #32   ; .endif
497  .if \ldsrc ; ld1     {v10.16b,v11.16b}, [x1], #32 ; .endif
4981:      tbz     x2, #4, 1f
499  .if \lddst ; ld1     {v1.16b}, [x0], #16  ; .endif
500  .if \ldsrc ; ld1     {v9.16b}, [x1], #16  ; .endif
5011:      tbz     x2, #3, 1f
502  .if \lddst ; ld1     {v0.d}[1], [x0], #8 ; .endif
503  .if \ldsrc ; ld1     {v8.d}[1], [x1], #8 ; .endif
5041:      tbz     x2, #2, 1f
505  .if \lddst ; ld1     {v0.s}[1], [x0], #4 ; .endif
506  .if \ldsrc ; ld1     {v8.s}[1], [x1], #4 ; .endif
5071:      tbz     x2, #1, 1f
508  .if \lddst ; ld1     {v0.h}[1], [x0], #2 ; .endif
509  .if \ldsrc ; ld1     {v8.h}[1], [x1], #2 ; .endif
5101:      tbz     x2, #0, 1f
511  .if \lddst ; ld1     {v0.b}[1], [x0], #1 ; .endif
512  .if \ldsrc ; ld1     {v8.b}[1], [x1], #1 ; .endif
5131:
514  .if \lddst ; sub     x0, x0, x2           ; .endif
515
516.if \zipped
517        /* One small impediment in the process above is that some of the load
518         * operations can't perform byte-wise structure deinterleaving at the
519         * same time as loading only part of a register.  So the data is loaded
520         * linearly and unpacked manually at this point.
521         */
522        uzp1    v4.16b, v0.16b, v1.16b
523        uzp2    v5.16b, v0.16b, v1.16b
524        uzp1    v6.16b, v2.16b, v3.16b
525        uzp2    v7.16b, v2.16b, v3.16b
526        uzp1    v0.16b, v4.16b, v6.16b
527        uzp2    v2.16b, v4.16b, v6.16b
528        uzp1    v1.16b, v5.16b, v7.16b
529        uzp2    v3.16b, v5.16b, v7.16b
530
531        uzp1    v4.16b, v8.16b, v9.16b
532        uzp2    v5.16b, v8.16b, v9.16b
533        uzp1    v6.16b, v10.16b, v11.16b
534        uzp2    v7.16b, v10.16b, v11.16b
535        uzp1    v8.16b, v4.16b, v6.16b
536        uzp2    v10.16b, v4.16b, v6.16b
537        uzp1    v9.16b, v5.16b, v7.16b
538        uzp2    v11.16b, v5.16b, v7.16b
539
540        \kernel
541
542        zip1    v4.16b, v0.16b, v2.16b
543        zip2    v6.16b, v0.16b, v2.16b
544        zip1    v5.16b, v1.16b, v3.16b
545        zip2    v7.16b, v1.16b, v3.16b
546        zip1    v0.16b, v4.16b, v5.16b
547        zip2    v1.16b, v4.16b, v5.16b
548        zip1    v2.16b, v6.16b, v7.16b
549        zip2    v3.16b, v6.16b, v7.16b
550  .else
551        \kernel
552  .endif
553
554        tbz     x2, #5, 1f
555        st1     {v2.16b,v3.16b}, [x0], #32
5561:      tbz     x2, #4, 1f
557        st1     {v1.16b}, [x0], #16
5581:      tbz     x2, #3, 1f
559        st1     {v0.d}[1], [x0], #8
5601:      tbz     x2, #2, 1f
561        st1     {v0.s}[1], [x0], #4
5621:      tbz     x2, #1, 1f
563        st1     {v0.h}[1], [x0], #2
5641:      tbz     x2, #0, 2f
565        st1     {v0.b}[1], [x0], #1
5662:      ld1     {v8.1d - v11.1d}, [sp], #32
567        ld1     {v12.1d - v15.1d}, [sp], #32
568.endif
569        mov     x0, #0
570        ret
571.endm
572
573
574/* produce list of blend_line_XX() functions; each function uses the wrap_line
575 * macro, passing it the name of the operation macro it wants along with
576 * optional parameters to remove unnecessary operations.
577 */
578#define BLEND_X(d, n) ENTRY(blend_line_##n) ; wrap_line blend_kernel_##n, params_##n ; END(blend_line_##n) ;
579    BLEND_LIST(BLEND_X)
580#undef BLEND_X
581
582#define BLEND_X(d, n) .set tablesize, d+1 ;
583    BLEND_LIST(BLEND_X)
584#undef BLEND_X
585
586/*  int rsdIntrinsicBlend_K(
587 *          uchar4 *out,        // x0
588 *          uchar4 const *in,   // x1
589 *          int slot,           // x2
590 *          size_t xstart,      // x3
591 *          size_t xend);       // x4
592 */
593ENTRY(rsdIntrinsicBlend_K)
594    adrp    x5, blendtable
595    add     x5, x5, :lo12:blendtable
596    cmp     w2, tablesize
597    bhs     1f
598    ldrsh   x6, [x5, w2, uxtw #1]
599    add     x0, x0, w3, uxtw #2
600    add     x1, x1, w3, uxtw #2
601    sub     w2, w4, w3
602    ubfiz   x2, x2, #2, #32 /* TODO: fix */
603    cbz     x6, 1f
604    adr     x5, 2f
605    add     x6, x5, x6
6062:  br      x6
6071:  mov     x0, #-1
608    ret
609
610END(rsdIntrinsicBlend_K)
611
612.rodata
613.set off,0
614blendtable:
615#define BLEND_X(d, n) .rept d-off ; .hword 0 ; .endr ; .hword blend_line_##n - 2b ; .set off, d+1 ;
616        BLEND_LIST(BLEND_X)
617#undef BLEND_X
618