1/*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
/* ENTRY(f): switch to the .text section, align, export f as a global symbol
 *           of type function, and place the label f.
 * END(f):   set the ELF size of f to the distance from its label to here.
 */
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;
19
20
/* vmxx_f32 i, mask, opd, opa, opb
 *
 * Assembly-time selection of one float multiply term; emits at most one
 * instruction depending on the term set i:
 *   (i & mask) == 0        : nothing is emitted
 *   (i & (mask - 1)) == 0  : fmul opd, opa, opb   (first term, opd overwritten)
 *   otherwise              : fmla opd, opa, opb   (added to terms already in opd)
 */
.macro vmxx_f32 i, mask, opd, opa, opb
  .ifne (\i) & \mask
    .ifeq (\i) & (\mask - 1)
        fmul            \opd, \opa, \opb
    .else
        fmla            \opd, \opa, \opb
    .endif
  .endif
.endm
30
/* vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
 *
 * Assembly-time selection of the additive term; emits at most one
 * instruction depending on the term set i:
 *   (i & mask) == 0        : nothing is emitted
 *   (i & (mask - 1)) == 0  : mov stupidsyntax1, stupidsyntax2   (no earlier term)
 *   otherwise              : fadd opd, opa, opb
 *
 * The call sites in this file pass stupidsyntax1/stupidsyntax2 as the same
 * registers as opd/opb, spelled with the .16b arrangement for the mov.
 */
.macro vadd_f32 i, mask, opd, opa, opb, stupidsyntax1, stupidsyntax2
  .ifne (\i) & \mask
    .ifeq (\i) & (\mask - 1)
        mov             \stupidsyntax1, \stupidsyntax2
    .else
        fadd            \opd, \opa, \opb
    .endif
  .endif
.endm
40
/* vmxx_s16 i, mask, opd, opa, opb
 *
 * Assembly-time selection of one widening 16-bit multiply term (low half of
 * opa); emits at most one instruction depending on the term set i:
 *   (i & mask) == 0             : nothing is emitted
 *   (i & (mask - 1 + 16)) == 0  : smull opd, opa, opb   (opd overwritten)
 *   otherwise                   : smlal opd, opa, opb   (accumulate into opd)
 *
 * Bit 16 counts as an earlier term: the integer column routines in this file
 * preload the accumulator with dup when that bit is set.
 */
.macro vmxx_s16 i, mask, opd, opa, opb
  .ifne (\i) & \mask
    .ifeq (\i) & (\mask - 1 + 16)
        smull           \opd, \opa, \opb
    .else
        smlal           \opd, \opa, \opb
    .endif
  .endif
.endm
50
/* vmxx2_s16 i, mask, opd, opa, opb
 *
 * Upper-half counterpart of vmxx_s16: the same selection rule, but the
 * emitted instruction is smull2 / smlal2.
 *   (i & mask) == 0             : nothing is emitted
 *   (i & (mask - 1 + 16)) == 0  : smull2 opd, opa, opb   (opd overwritten)
 *   otherwise                   : smlal2 opd, opa, opb   (accumulate into opd)
 */
.macro vmxx2_s16 i, mask, opd, opa, opb
  .ifne (\i) & \mask
    .ifeq (\i) & (\mask - 1 + 16)
        smull2          \opd, \opa, \opb
    .else
        smlal2          \opd, \opa, \opb
    .endif
  .endif
.endm
60
/* Register roles shared by the routines in this file.  Every routine ends by
 * branching through one of x4-x9 (or to an *_end label), so the stages chain
 * as load -> column0 -> column1 -> column2 -> column3 -> store -> load.
 *
 * x0 = dst
 * x1 = src
 * x2 = count
 * x3 = params
 * x4 = column0_fn
 * x5 = column1_fn
 * x6 = column2_fn
 * x7 = column3_fn
 * x8 = store_fn
 * x9 = load_fn
 */
/* Column routine templates.  For each i in 0..15 the .irp body below is
 * expanded once, defining every routine twice: the plain form uses the term
 * set i and the _n form uses the complemented term set i^31.  Bits 1, 2, 4
 * and 8 of the term set select the multiply terms for input vectors
 * v12/v13/v14/v15 (and v20-v23 in the float routines); bit 16 selects the
 * additive term.  Since i never exceeds 15, bit 16 is never set in the plain
 * forms and is always set in the _n forms.
 */
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

/* Integer column 0 -> v8.  v6 accumulates the low four lanes and v7 the high
 * four lanes of the selected 16-bit inputs multiplied by v0.h[0], v0.h[4],
 * v1.h[0], v1.h[4]; with bit 16 set both are first seeded from v4.s[0].
 * The 32-bit sums are shifted right by 8 and narrowed with unsigned
 * saturation, then control passes to the routine in x5.
 * NOTE(review): for i = 0 the plain form emits no multiply, so v6/v7 are
 * narrowed as found -- presumably that variant is never selected; confirm
 * against the setup code.
 */
.align 6
colormatrix_int_col0_\i:
      .if \i & 16
            dup         v6.4s, v4.s[0]
            dup         v7.4s, v4.s[0]
      .endif
            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[0]
            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[4]
            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[0]
            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[4]
            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[0]
            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[4]
            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[0]
            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[4]
            sqshrun     v8.4h, v6.4s, #8
            sqshrun2    v8.8h, v7.4s, #8
            br          x5

/* Integer column 0, complemented term set (i^31); additive seed always present. */
colormatrix_int_col0_n\i:
      .if (\i^31) & 16
            dup         v6.4s, v4.s[0]
            dup         v7.4s, v4.s[0]
      .endif
            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[0]
            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[4]
            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[0]
            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[4]
            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[0]
            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[4]
            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[0]
            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[4]
            sqshrun     v8.4h, v6.4s, #8
            sqshrun2    v8.8h, v7.4s, #8
            br          x5

/* Integer column 1 -> v9.  Same scheme as column 0 with coefficient lanes
 * v0.h[1], v0.h[5], v1.h[1], v1.h[5] and seed v4.s[1]; continues at x6.
 */
.align 6
colormatrix_int_col1_\i:
      .if \i & 16
            dup         v6.4s, v4.s[1]
            dup         v7.4s, v4.s[1]
      .endif
            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[1]
            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[5]
            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[1]
            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[5]
            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[1]
            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[5]
            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[1]
            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[5]
            sqshrun     v9.4h, v6.4s, #8
            sqshrun2    v9.8h, v7.4s, #8
            br          x6

/* Integer column 1, complemented term set (i^31). */
colormatrix_int_col1_n\i:
      .if (\i^31) & 16
            dup         v6.4s, v4.s[1]
            dup         v7.4s, v4.s[1]
      .endif
            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[1]
            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[5]
            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[1]
            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[5]
            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[1]
            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[5]
            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[1]
            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[5]
            sqshrun     v9.4h, v6.4s, #8
            sqshrun2    v9.8h, v7.4s, #8
            br          x6

/* Integer column 2 -> v10.  Coefficient lanes v0.h[2], v0.h[6], v1.h[2],
 * v1.h[6] and seed v4.s[2]; continues at x7.
 */
.align 6
colormatrix_int_col2_\i:
      .if \i & 16
            dup         v6.4s, v4.s[2]
            dup         v7.4s, v4.s[2]
      .endif
            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[2]
            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[6]
            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[2]
            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[6]
            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[2]
            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[6]
            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[2]
            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[6]
            sqshrun     v10.4h, v6.4s, #8
            sqshrun2    v10.8h, v7.4s, #8
            br          x7

/* Integer column 2, complemented term set (i^31). */
colormatrix_int_col2_n\i:
      .if (\i^31) & 16
            dup         v6.4s, v4.s[2]
            dup         v7.4s, v4.s[2]
      .endif
            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[2]
            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[6]
            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[2]
            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[6]
            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[2]
            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[6]
            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[2]
            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[6]
            sqshrun     v10.4h, v6.4s, #8
            sqshrun2    v10.8h, v7.4s, #8
            br          x7

/* Integer column 3 -> v11.  Coefficient lanes v0.h[3], v0.h[7], v1.h[3],
 * v1.h[7] and seed v4.s[3]; continues at x8 (the store routine).
 */
.align 6
colormatrix_int_col3_\i:
      .if \i & 16
            dup         v6.4s, v4.s[3]
            dup         v7.4s, v4.s[3]
      .endif
            vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[3]
            vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[7]
            vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[3]
            vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[7]
            vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[3]
            vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[7]
            vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[3]
            vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[7]
            sqshrun     v11.4h, v6.4s, #8
            sqshrun2    v11.8h, v7.4s, #8
            br          x8

/* Integer column 3, complemented term set (i^31). */
colormatrix_int_col3_n\i:
      .if (\i^31) & 16
            dup         v6.4s, v4.s[3]
            dup         v7.4s, v4.s[3]
      .endif
            vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[3]
            vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[7]
            vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[3]
            vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[7]
            vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[3]
            vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[7]
            vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[3]
            vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[7]
            sqshrun     v11.4h, v6.4s, #8
            sqshrun2    v11.8h, v7.4s, #8
            br          x8

/* Float column 0 -> v8 (from v12-v15) and v16 (from v20-v23).  The selected
 * inputs are scaled by lane 0 of v0, v1, v2, v3 and summed; with bit 16 set
 * v4 is added (or copied when no multiply term is selected).  Continues at
 * the routine in x5.
 */
.align 5
colormatrix_float_col0_\i:
            vmxx_f32    \i, 1,  v8.4s, v12.4s, v0.s[0]
            vmxx_f32    \i, 2,  v8.4s, v13.4s, v1.s[0]
            vmxx_f32    \i, 4,  v8.4s, v14.4s, v2.s[0]
            vmxx_f32    \i, 8,  v8.4s, v15.4s, v3.s[0]
            vadd_f32    \i, 16, v8.4s, v8.4s, v4.4s,        v8.16b, v4.16b
            vmxx_f32    \i, 1,  v16.4s, v20.4s, v0.s[0]
            vmxx_f32    \i, 2,  v16.4s, v21.4s, v1.s[0]
            vmxx_f32    \i, 4,  v16.4s, v22.4s, v2.s[0]
            vmxx_f32    \i, 8,  v16.4s, v23.4s, v3.s[0]
            vadd_f32    \i, 16, v16.4s, v16.4s, v4.4s,      v16.16b, v4.16b
            br          x5

/* Float column 0, complemented term set (i^31). */
.align 4
colormatrix_float_col0_n\i:
            vmxx_f32    \i^31, 1,  v8.4s, v12.4s, v0.s[0]
            vmxx_f32    \i^31, 2,  v8.4s, v13.4s, v1.s[0]
            vmxx_f32    \i^31, 4,  v8.4s, v14.4s, v2.s[0]
            vmxx_f32    \i^31, 8,  v8.4s, v15.4s, v3.s[0]
            vadd_f32    \i^31, 16, v8.4s, v8.4s, v4.4s,     v8.16b, v4.16b
            vmxx_f32    \i^31, 1,  v16.4s, v20.4s, v0.s[0]
            vmxx_f32    \i^31, 2,  v16.4s, v21.4s, v1.s[0]
            vmxx_f32    \i^31, 4,  v16.4s, v22.4s, v2.s[0]
            vmxx_f32    \i^31, 8,  v16.4s, v23.4s, v3.s[0]
            vadd_f32    \i^31, 16, v16.4s, v16.4s, v4.4s,   v16.16b, v4.16b
            br          x5

/* Float column 1 -> v9 and v17.  Lane 1 of v0-v3, additive vector v5;
 * continues at x6.
 */
.align 5
colormatrix_float_col1_\i:
            vmxx_f32    \i, 1,  v9.4s, v12.4s, v0.s[1]
            vmxx_f32    \i, 2,  v9.4s, v13.4s, v1.s[1]
            vmxx_f32    \i, 4,  v9.4s, v14.4s, v2.s[1]
            vmxx_f32    \i, 8,  v9.4s, v15.4s, v3.s[1]
            vadd_f32    \i, 16, v9.4s, v9.4s, v5.4s,        v9.16b, v5.16b
            vmxx_f32    \i, 1,  v17.4s, v20.4s, v0.s[1]
            vmxx_f32    \i, 2,  v17.4s, v21.4s, v1.s[1]
            vmxx_f32    \i, 4,  v17.4s, v22.4s, v2.s[1]
            vmxx_f32    \i, 8,  v17.4s, v23.4s, v3.s[1]
            vadd_f32    \i, 16, v17.4s, v17.4s, v5.4s,      v17.16b, v5.16b
            br          x6

/* Float column 1, complemented term set (i^31). */
.align 4
colormatrix_float_col1_n\i:
            vmxx_f32    \i^31, 1,  v9.4s, v12.4s, v0.s[1]
            vmxx_f32    \i^31, 2,  v9.4s, v13.4s, v1.s[1]
            vmxx_f32    \i^31, 4,  v9.4s, v14.4s, v2.s[1]
            vmxx_f32    \i^31, 8,  v9.4s, v15.4s, v3.s[1]
            vadd_f32    \i^31, 16, v9.4s, v9.4s, v5.4s,     v9.16b, v5.16b
            vmxx_f32    \i^31, 1,  v17.4s, v20.4s, v0.s[1]
            vmxx_f32    \i^31, 2,  v17.4s, v21.4s, v1.s[1]
            vmxx_f32    \i^31, 4,  v17.4s, v22.4s, v2.s[1]
            vmxx_f32    \i^31, 8,  v17.4s, v23.4s, v3.s[1]
            vadd_f32    \i^31, 16, v17.4s, v17.4s, v5.4s,   v17.16b, v5.16b
            br          x6

/* Float column 2 -> v10 and v18.  Lane 2 of v0-v3, additive vector v6;
 * continues at x7.
 */
.align 5
colormatrix_float_col2_\i:
            vmxx_f32    \i, 1,  v10.4s, v12.4s, v0.s[2]
            vmxx_f32    \i, 2,  v10.4s, v13.4s, v1.s[2]
            vmxx_f32    \i, 4,  v10.4s, v14.4s, v2.s[2]
            vmxx_f32    \i, 8,  v10.4s, v15.4s, v3.s[2]
            vadd_f32    \i, 16, v10.4s, v10.4s, v6.4s,      v10.16b, v6.16b
            vmxx_f32    \i, 1,  v18.4s, v20.4s, v0.s[2]
            vmxx_f32    \i, 2,  v18.4s, v21.4s, v1.s[2]
            vmxx_f32    \i, 4,  v18.4s, v22.4s, v2.s[2]
            vmxx_f32    \i, 8,  v18.4s, v23.4s, v3.s[2]
            vadd_f32    \i, 16, v18.4s, v18.4s, v6.4s,      v18.16b, v6.16b
            br          x7

/* Float column 2, complemented term set (i^31). */
.align 4
colormatrix_float_col2_n\i:
            vmxx_f32    \i^31, 1,  v10.4s, v12.4s, v0.s[2]
            vmxx_f32    \i^31, 2,  v10.4s, v13.4s, v1.s[2]
            vmxx_f32    \i^31, 4,  v10.4s, v14.4s, v2.s[2]
            vmxx_f32    \i^31, 8,  v10.4s, v15.4s, v3.s[2]
            vadd_f32    \i^31, 16, v10.4s, v10.4s, v6.4s,   v10.16b, v6.16b
            vmxx_f32    \i^31, 1,  v18.4s, v20.4s, v0.s[2]
            vmxx_f32    \i^31, 2,  v18.4s, v21.4s, v1.s[2]
            vmxx_f32    \i^31, 4,  v18.4s, v22.4s, v2.s[2]
            vmxx_f32    \i^31, 8,  v18.4s, v23.4s, v3.s[2]
            vadd_f32    \i^31, 16, v18.4s, v18.4s, v6.4s,   v18.16b, v6.16b
            br          x7

/* Float column 3 -> v11 and v19.  Lane 3 of v0-v3, additive vector v7;
 * continues at x8 (the store routine).
 */
.align 5
colormatrix_float_col3_\i:
            vmxx_f32    \i, 1,  v11.4s, v12.4s, v0.s[3]
            vmxx_f32    \i, 2,  v11.4s, v13.4s, v1.s[3]
            vmxx_f32    \i, 4,  v11.4s, v14.4s, v2.s[3]
            vmxx_f32    \i, 8,  v11.4s, v15.4s, v3.s[3]
            vadd_f32    \i, 16, v11.4s, v11.4s, v7.4s,      v11.16b, v7.16b
            vmxx_f32    \i, 1,  v19.4s, v20.4s, v0.s[3]
            vmxx_f32    \i, 2,  v19.4s, v21.4s, v1.s[3]
            vmxx_f32    \i, 4,  v19.4s, v22.4s, v2.s[3]
            vmxx_f32    \i, 8,  v19.4s, v23.4s, v3.s[3]
            vadd_f32    \i, 16, v19.4s, v19.4s, v7.4s,      v19.16b, v7.16b
            br          x8

/* Float column 3, complemented term set (i^31). */
.align 4
colormatrix_float_col3_n\i:
            vmxx_f32    \i^31, 1,  v11.4s, v12.4s, v0.s[3]
            vmxx_f32    \i^31, 2,  v11.4s, v13.4s, v1.s[3]
            vmxx_f32    \i^31, 4,  v11.4s, v14.4s, v2.s[3]
            vmxx_f32    \i^31, 8,  v11.4s, v15.4s, v3.s[3]
            vadd_f32    \i^31, 16, v11.4s, v11.4s, v7.4s,  v11.16b, v7.16b
            vmxx_f32    \i^31, 1,  v19.4s, v20.4s, v0.s[3]
            vmxx_f32    \i^31, 2,  v19.4s, v21.4s, v1.s[3]
            vmxx_f32    \i^31, 4,  v19.4s, v22.4s, v2.s[3]
            vmxx_f32    \i^31, 8,  v19.4s, v23.4s, v3.s[3]
            vadd_f32    \i^31, 16, v19.4s, v19.4s, v7.4s,  v19.16b, v7.16b
            br          x8

.endr
327
/* Load stage, 4 x u8 per element, float pipeline.  Reads 32 bytes from x1
 * (post-incremented), de-interleaving the four channels of eight elements.
 * Each channel is zero-extended u8 -> u16 -> u32 and converted to float:
 * elements 0-3 land in v12-v15, elements 4-7 in v20-v23.  Continues at the
 * routine in x4.
 */
.align 6
colormatrix_float_ldu4:
            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
            uxtl        v20.8h, v20.8b
            uxtl        v21.8h, v21.8b
            uxtl        v22.8h, v22.8b
            uxtl        v23.8h, v23.8b
            uxtl        v12.4s, v20.4h
            uxtl        v13.4s, v21.4h
            uxtl        v14.4s, v22.4h
            uxtl        v15.4s, v23.4h
            uxtl2       v20.4s, v20.8h
            uxtl2       v21.4s, v21.8h
            uxtl2       v22.4s, v22.8h
            uxtl2       v23.4s, v23.8h
            ucvtf       v12.4s, v12.4s
            ucvtf       v13.4s, v13.4s
            ucvtf       v14.4s, v14.4s
            ucvtf       v15.4s, v15.4s
            ucvtf       v20.4s, v20.4s
            ucvtf       v21.4s, v21.4s
            ucvtf       v22.4s, v22.4s
            ucvtf       v23.4s, v23.4s
            br          x4
352
/* Load stage, 4 x u8 per element, integer pipeline.  Reads 32 bytes from x1
 * (post-incremented), de-interleaving four channels of eight elements into
 * v12-v15 and zero-extending each to 16 bits.  Continues at the routine in x4.
 */
.align 5
colormatrix_int_ldu4:
            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
            uxtl        v12.8h, v12.8b
            uxtl        v13.8h, v13.8b
            uxtl        v14.8h, v14.8b
            uxtl        v15.8h, v15.8b
            br          x4
361
/* Load stage, 3-channel u8 input stored 4 bytes per element, float pipeline.
 * Reads 32 bytes from x1 (post-incremented) exactly like the 4-channel load,
 * but only the first three channels are widened and converted: elements 0-3
 * land in v12-v14, elements 4-7 in v20-v22.  The fourth loaded channel stays
 * in v23 as raw bytes and v15 is not written.  Continues at the routine in x4.
 */
.align 6
colormatrix_float_ldu3:
            ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
            uxtl        v20.8h, v20.8b
            uxtl        v21.8h, v21.8b
            uxtl        v22.8h, v22.8b
            uxtl        v12.4s, v20.4h
            uxtl        v13.4s, v21.4h
            uxtl        v14.4s, v22.4h
            uxtl2       v20.4s, v20.8h
            uxtl2       v21.4s, v21.8h
            uxtl2       v22.4s, v22.8h
            ucvtf       v12.4s, v12.4s
            ucvtf       v13.4s, v13.4s
            ucvtf       v14.4s, v14.4s
            ucvtf       v20.4s, v20.4s
            ucvtf       v21.4s, v21.4s
            ucvtf       v22.4s, v22.4s
            br          x4
381
/* Load stage, 3-channel u8 input stored 4 bytes per element, integer
 * pipeline.  Reads 32 bytes from x1 (post-incremented) into v12-v15 and
 * zero-extends the first three channels to 16 bits; v15 keeps the raw bytes
 * of the fourth channel.  Continues at the routine in x4.
 */
colormatrix_int_ldu3:
            ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
            uxtl        v12.8h, v12.8b
            uxtl        v13.8h, v13.8b
            uxtl        v14.8h, v14.8b
            br          x4
388
/* Load stage, 1 x u8 per element, float pipeline.  Reads 8 bytes from x1
 * (post-incremented), zero-extends them to 32 bits and converts to float:
 * elements 0-3 land in v12, elements 4-7 in v20.  Continues at the routine
 * in x4.
 */
.align 5
colormatrix_float_ldu1:
            ld1         {v20.8b}, [x1], #8
            uxtl        v20.8h, v20.8b
            uxtl        v12.4s, v20.4h
            uxtl2       v20.4s, v20.8h
            ucvtf       v12.4s, v12.4s
            ucvtf       v20.4s, v20.4s
            br          x4
398
/* Load stage, 2 x u8 per element, float pipeline.  Reads 16 bytes from x1
 * (post-incremented), de-interleaving two channels of eight elements.  Each
 * channel is zero-extended to 32 bits and converted to float: elements 0-3
 * land in v12-v13, elements 4-7 in v20-v21.  Continues at the routine in x4.
 */
.align 6
colormatrix_float_ldu2:
            ld2         {v20.8b,v21.8b}, [x1], #16
            uxtl        v20.8h, v20.8b
            uxtl        v21.8h, v21.8b
            uxtl        v12.4s, v20.4h
            uxtl        v13.4s, v21.4h
            uxtl2       v20.4s, v20.8h
            uxtl2       v21.4s, v21.8h
            ucvtf       v12.4s, v12.4s
            ucvtf       v13.4s, v13.4s
            ucvtf       v20.4s, v20.4s
            ucvtf       v21.4s, v21.4s
            br          x4
413
/* Load stage, 2 x u8 per element, integer pipeline.  Reads 16 bytes from x1
 * (post-incremented), de-interleaving two channels into v12-v13 and
 * zero-extending each to 16 bits.  Continues at the routine in x4.
 */
.align 4
colormatrix_int_ldu2:
            ld2         {v12.8b,v13.8b}, [x1], #16
            uxtl        v12.8h, v12.8b
            uxtl        v13.8h, v13.8b
            br          x4
420
/* Store stage, float results to 4 x u8 per element.  v8-v11 (elements 0-3)
 * and v16-v19 (elements 4-7) are converted to signed fixed point with one
 * fractional bit, then rounding-shifted right by one with unsigned saturation
 * to 16 bits and saturated again to 8 bits.  The four channels are stored
 * interleaved as 32 bytes at x0 (post-incremented).  x2 is reduced by 8
 * before the store; if that subtraction borrows, control goes to
 * colormatrix_float_end, otherwise to the routine in x9.
 */
.align 6
colormatrix_float_stu4:
            fcvtzs      v24.4s, v8.4s, #1
            fcvtzs      v25.4s, v9.4s, #1
            fcvtzs      v26.4s, v10.4s, #1
            fcvtzs      v27.4s, v11.4s, #1
            fcvtzs      v28.4s, v16.4s, #1
            fcvtzs      v29.4s, v17.4s, #1
            fcvtzs      v30.4s, v18.4s, #1
            fcvtzs      v31.4s, v19.4s, #1
            sqrshrun    v24.4h, v24.4s, #1
            sqrshrun    v25.4h, v25.4s, #1
            sqrshrun    v26.4h, v26.4s, #1
            sqrshrun    v27.4h, v27.4s, #1
            sqrshrun2   v24.8h, v28.4s, #1
            sqrshrun2   v25.8h, v29.4s, #1
            sqrshrun2   v26.8h, v30.4s, #1
            sqrshrun2   v27.8h, v31.4s, #1
            uqxtn       v24.8b, v24.8h
            uqxtn       v25.8b, v25.8h
            uqxtn       v26.8b, v26.8h
            uqxtn       v27.8b, v27.8h
            subs        x2, x2, #8
            /* the store does not touch the flags set by subs */
            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
            blo         colormatrix_float_end
            br          x9
447
/* Store stage, integer results to 4 x u8 per element.  The 16-bit lanes of
 * v8-v11 are narrowed with unsigned saturation into v12-v15 and stored
 * interleaved as 32 bytes at x0 (post-incremented).  x2 is reduced by 8
 * before the store; if that subtraction borrows, control goes to
 * colormatrix_int_end, otherwise to the routine in x9.
 */
.align 5
colormatrix_int_stu4:
            uqxtn       v12.8b, v8.8h
            uqxtn       v13.8b, v9.8h
            uqxtn       v14.8b, v10.8h
            uqxtn       v15.8b, v11.8h
            subs        x2, x2, #8
            /* the store does not touch the flags set by subs */
            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
            blo         colormatrix_int_end
            br          x9
458
/* Store stage, float results to 3-channel u8 output stored 4 bytes per
 * element.  v8-v10 (elements 0-3) and v16-v18 (elements 4-7) are converted
 * to fixed point with one fractional bit, rounding-shifted right by one with
 * unsigned saturation, and narrowed to 8 bits; the fourth stored byte of each
 * element is zero (v27).  32 bytes are written at x0 (post-incremented).
 * x2 is reduced by 8; on borrow control goes to colormatrix_float_end,
 * otherwise to the routine in x9.
 */
.align 6
colormatrix_float_stu3:
            fcvtzs      v24.4s, v8.4s, #1
            fcvtzs      v25.4s, v9.4s, #1
            fcvtzs      v26.4s, v10.4s, #1
            fcvtzs      v28.4s, v16.4s, #1
            fcvtzs      v29.4s, v17.4s, #1
            fcvtzs      v30.4s, v18.4s, #1
            sqrshrun    v24.4h, v24.4s, #1
            sqrshrun    v25.4h, v25.4s, #1
            sqrshrun    v26.4h, v26.4s, #1
            sqrshrun2   v24.8h, v28.4s, #1
            sqrshrun2   v25.8h, v29.4s, #1
            sqrshrun2   v26.8h, v30.4s, #1
            uqxtn       v24.8b, v24.8h
            uqxtn       v25.8b, v25.8h
            uqxtn       v26.8b, v26.8h
            movi        v27.8b, #0
            subs        x2, x2, #8
            st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
            blo         colormatrix_float_end
            br          x9
481
/* Load stage, 1 x u8 per element, integer pipeline.  Reads 8 bytes from x1
 * (post-incremented) into v12 and zero-extends them to 16 bits.  Continues
 * at the routine in x4.
 */
.align 4
colormatrix_int_ldu1:
            ld1         {v12.8b}, [x1], #8
            uxtl        v12.8h, v12.8b
            br          x4
487
/* Store stage, integer results to 3-channel u8 output stored 4 bytes per
 * element.  v8-v10 are narrowed with unsigned saturation into v12-v14 and
 * the fourth stored byte of each element is zero (v15).  32 bytes are written
 * at x0 (post-incremented).  x2 is reduced by 8; on borrow control goes to
 * colormatrix_int_end, otherwise to the routine in x9.
 */
.align 5
colormatrix_int_stu3:
            uqxtn       v12.8b, v8.8h
            uqxtn       v13.8b, v9.8h
            uqxtn       v14.8b, v10.8h
            movi        v15.8b, #0
            subs        x2, x2, #8
            st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
            blo         colormatrix_int_end
            br          x9
498
/* Store stage, float results to 2 x u8 per element.  v8-v9 (elements 0-3)
 * and v16-v17 (elements 4-7) are converted to fixed point with one
 * fractional bit, rounding-shifted right by one with unsigned saturation,
 * narrowed to 8 bits and stored interleaved as 16 bytes at x0
 * (post-incremented).  x2 is reduced by 8; on borrow control goes to
 * colormatrix_float_end, otherwise to the routine in x9.
 */
.align 6
colormatrix_float_stu2:
            fcvtzs      v24.4s, v8.4s, #1
            fcvtzs      v25.4s, v9.4s, #1
            fcvtzs      v28.4s, v16.4s, #1
            fcvtzs      v29.4s, v17.4s, #1
            sqrshrun    v24.4h, v24.4s, #1
            sqrshrun    v25.4h, v25.4s, #1
            sqrshrun2   v24.8h, v28.4s, #1
            sqrshrun2   v25.8h, v29.4s, #1
            uqxtn       v24.8b, v24.8h
            uqxtn       v25.8b, v25.8h
            subs        x2, x2, #8
            st2         {v24.8b,v25.8b}, [x0], #16
            blo         colormatrix_float_end
            br          x9
515
/* Store stage, integer results to 2 x u8 per element.  v8-v9 are narrowed
 * with unsigned saturation into v12-v13 and stored interleaved as 16 bytes
 * at x0 (post-incremented).  x2 is reduced by 8; on borrow control goes to
 * colormatrix_int_end, otherwise to the routine in x9.
 */
.align 5
colormatrix_int_stu2:
            uqxtn       v12.8b, v8.8h
            uqxtn       v13.8b, v9.8h
            subs        x2, x2, #8
            st2         {v12.8b,v13.8b}, [x0], #16
            blo         colormatrix_int_end
            br          x9
524
/* Store stage, integer results to 1 x u8 per element.  v8 is narrowed with
 * unsigned saturation into v12 and stored as 8 bytes at x0
 * (post-incremented).  x2 is reduced by 8; on borrow control goes to
 * colormatrix_int_end, otherwise to the routine in x9.
 */
.align 5
colormatrix_int_stu1:
            uqxtn       v12.8b, v8.8h
            subs        x2, x2, #8
            st1         {v12.8b}, [x0], #8
            blo         colormatrix_int_end
            br          x9
532
/* Load stage, float input with three channels.  Performs two 64-byte
 * de-interleaving loads from x1 (post-incremented): the first fills v12-v15,
 * the second v20-v23, so each element occupies four floats in memory.  The
 * instruction sequence is the same as colormatrix_float_ldf4.  Continues at
 * the routine in x4.
 */
colormatrix_float_ldf3:
            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
            br          x4
537
/* Store stage, float results to 1 x u8 per element.  v8 (elements 0-3) and
 * v16 (elements 4-7) are converted to fixed point with one fractional bit,
 * rounding-shifted right by one with unsigned saturation, narrowed to 8 bits
 * and stored as 8 bytes at x0 (post-incremented).  x2 is reduced by 8; on
 * borrow control goes to colormatrix_float_end, otherwise to the routine in
 * x9.
 */
.align 6
colormatrix_float_stu1:
            fcvtzs      v24.4s, v8.4s, #1
            fcvtzs      v28.4s, v16.4s, #1
            sqrshrun    v24.4h, v24.4s, #1
            sqrshrun2   v24.8h, v28.4s, #1
            uqxtn       v24.8b, v24.8h
            subs        x2, x2, #8
            st1         {v24.8b}, [x0], #8
            blo         colormatrix_float_end
            br          x9
549
/* Store stage, float results with three channels written four floats per
 * element.  The fourth channel registers v11 and v19 are zeroed, then
 * v8-v11 and v16-v19 are stored interleaved as two 64-byte groups at x0
 * (post-incremented).  x2 is reduced by 8 between the stores; on borrow
 * control goes to colormatrix_float_end, otherwise to the routine in x9.
 */
colormatrix_float_stf3:
            movi        v11.16b, #0
            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
            movi        v19.16b, #0
            subs        x2, x2, #8
            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
            blo         colormatrix_float_end
            br          x9
558
/* Store stage, float results with four channels.  v8-v11 and v16-v19 are
 * stored interleaved as two 64-byte groups at x0 (post-incremented).  x2 is
 * reduced by 8 between the stores; on borrow control goes to
 * colormatrix_float_end, otherwise to the routine in x9.
 */
.align 5
colormatrix_float_stf4:
            st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
            subs        x2, x2, #8
            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
            blo         colormatrix_float_end
            br          x9
566
/* Load stage, float input with four channels.  Performs two 64-byte
 * de-interleaving loads from x1 (post-incremented): the first fills v12-v15,
 * the second v20-v23.  Continues at the routine in x4.
 */
colormatrix_float_ldf4:
            ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
            br          x4
571
/* Store stage, float results with two channels.  v8-v9 and v16-v17 are
 * stored interleaved as two 32-byte groups at x0 (post-incremented).  x2 is
 * reduced by 8 between the stores; on borrow control goes to
 * colormatrix_float_end, otherwise to the routine in x9.
 */
.align 5
colormatrix_float_stf2:
            st2         {v8.4s, v9.4s}, [x0], #32
            subs        x2, x2, #8
            st2         {v16.4s, v17.4s}, [x0], #32
            blo         colormatrix_float_end
            br          x9
579
/* Load stage, float input with two channels.  Performs two 32-byte
 * de-interleaving loads from x1 (post-incremented): the first fills v12-v13,
 * the second v20-v21.  Continues at the routine in x4.
 */
colormatrix_float_ldf2:
            ld2         {v12.4s,v13.4s}, [x1], #32
            ld2         {v20.4s,v21.4s}, [x1], #32
            br          x4
584
/* Store stage, float results with one channel.  v8 and then v16 are stored
 * as two 16-byte groups at x0 (post-incremented).  x2 is reduced by 8
 * between the stores; on borrow control goes to colormatrix_float_end,
 * otherwise to the routine in x9.
 */
.align 5
colormatrix_float_stf1:
            st1         {v8.4s}, [x0], #16
            subs        x2, x2, #8
            st1         {v16.4s}, [x0], #16
            blo         colormatrix_float_end
            br          x9
592
/* Load stage, float input with one channel.  Reads two 16-byte groups from
 * x1 (post-incremented): the first into v12, the second into v20.  Continues
 * at the routine in x4.
 */
colormatrix_float_ldf1:
            ld1         {v12.4s}, [x1], #16
            ld1         {v20.4s}, [x1], #16
            br          x4
597
/* Tail store, integer results to 1 x u8 per element.  v8 is narrowed with
 * unsigned saturation into v12; bits 2, 1 and 0 of x2 then select whether
 * bytes 4-7, bytes 2-3 and byte 1 of v12 are written at x0
 * (post-incremented).  Finishes at colormatrix_int_realend.
 */
colormatrix_int_stu1_end:
            uqxtn       v12.8b, v8.8h
            tbz         x2, #2, .Lcolormatrix_int_stu1_end_no4
            st1         {v12.s}[1], [x0], #4
.Lcolormatrix_int_stu1_end_no4:
            tbz         x2, #1, .Lcolormatrix_int_stu1_end_no2
            st1         {v12.h}[1], [x0], #2
.Lcolormatrix_int_stu1_end_no2:
            tbz         x2, #0, .Lcolormatrix_int_stu1_end_done
            st1         {v12.b}[1], [x0], #1
.Lcolormatrix_int_stu1_end_done:
            b           colormatrix_int_realend
607
/* Tail store, integer results to 2 x u8 per element.  v8 and v9 are narrowed
 * with unsigned saturation and byte-interleaved into v12, giving two bytes
 * per element.  Bits 2, 1 and 0 of x2 select whether elements 4-7 (upper 8
 * bytes), elements 2-3 and element 1 are written at x0 (post-incremented).
 * Finishes at colormatrix_int_realend.
 */
colormatrix_int_stu2_end:
            uqxtn       v12.8b, v8.8h
            uqxtn       v13.8b, v9.8h
            zip1        v12.16b, v12.16b, v13.16b
            tbz         x2, #2, .Lcolormatrix_int_stu2_end_no4
            st1         {v12.d}[1], [x0], #8
.Lcolormatrix_int_stu2_end_no4:
            tbz         x2, #1, .Lcolormatrix_int_stu2_end_no2
            st1         {v12.s}[1], [x0], #4
.Lcolormatrix_int_stu2_end_no2:
            tbz         x2, #0, .Lcolormatrix_int_stu2_end_done
            st1         {v12.h}[1], [x0], #2
.Lcolormatrix_int_stu2_end_done:
            b           colormatrix_int_realend
619
/* Tail store, integer results to 3-channel u8 output stored 4 bytes per
 * element.  v8-v10 are narrowed with unsigned saturation into v12-v14 and
 * v15 supplies a zero fourth byte.  Bits 2, 1 and 0 of x2 select whether
 * lanes 4-7, lanes 2-3 and lane 1 are written, one 4-byte element per store,
 * at x0 (post-incremented).  Finishes at colormatrix_int_realend.
 */
colormatrix_int_stu3_end:
            movi        v15.8b, #0
            uqxtn       v12.8b, v8.8h
            uqxtn       v13.8b, v9.8h
            uqxtn       v14.8b, v10.8h
            tbz         x2, #2, .Lcolormatrix_int_stu3_end_no4
            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
.Lcolormatrix_int_stu3_end_no4:
            tbz         x2, #1, .Lcolormatrix_int_stu3_end_no2
            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
.Lcolormatrix_int_stu3_end_no2:
            tbz         x2, #0, .Lcolormatrix_int_stu3_end_done
            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
.Lcolormatrix_int_stu3_end_done:
            b           colormatrix_int_realend
636
/* Tail store, integer results to 4 x u8 per element.  v8-v11 are narrowed
 * with unsigned saturation into v12-v15.  Bits 2, 1 and 0 of x2 select
 * whether lanes 4-7, lanes 2-3 and lane 1 are written, one 4-byte element
 * per store, at x0 (post-incremented).  Finishes at colormatrix_int_realend.
 */
colormatrix_int_stu4_end:
            uqxtn       v12.8b, v8.8h
            uqxtn       v13.8b, v9.8h
            uqxtn       v14.8b, v10.8h
            uqxtn       v15.8b, v11.8h
            tbz         x2, #2, .Lcolormatrix_int_stu4_end_no4
            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
.Lcolormatrix_int_stu4_end_no4:
            tbz         x2, #1, .Lcolormatrix_int_stu4_end_no2
            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
.Lcolormatrix_int_stu4_end_no2:
            tbz         x2, #0, .Lcolormatrix_int_stu4_end_done
            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
.Lcolormatrix_int_stu4_end_done:
            b           colormatrix_int_realend
653
654
/* Tail load, 1 x u8 per element, integer pipeline.  Bits 2, 1 and 0 of x2
 * select whether 4, 2 and 1 bytes are read from x1 (post-incremented) into
 * bytes 12-15, 10-11 and 9 of v15.  The upper eight bytes of v15 are then
 * zero-extended into v12, so the loaded values occupy lanes 4-7, 2-3 and 1;
 * lanes that were not loaded carry whatever v15 held before.  Continues at
 * the routine in x4.
 */
colormatrix_int_ldu1_end:
            tbz         x2, #2, .Lcolormatrix_int_ldu1_end_no4
            ld1         {v15.s}[3], [x1], #4
.Lcolormatrix_int_ldu1_end_no4:
            tbz         x2, #1, .Lcolormatrix_int_ldu1_end_no2
            ld1         {v15.h}[5], [x1], #2
.Lcolormatrix_int_ldu1_end_no2:
            tbz         x2, #0, .Lcolormatrix_int_ldu1_end_widen
            ld1         {v15.b}[9], [x1], #1
.Lcolormatrix_int_ldu1_end_widen:
            uxtl2       v12.8h, v15.16b
            br          x4
664
/* Tail load, 2 x u8 per element, integer pipeline.  Bits 2, 1 and 0 of x2
 * select whether 8, 4 and 2 bytes are read from x1 (post-incremented) into
 * the upper half, word 1 and halfword 1 of v15, i.e. elements 4-7, 2-3 and
 * 1.  The even bytes (first channel) are then zero-extended into v12 and the
 * odd bytes (second channel) into v13.  Continues at the routine in x4.
 */
colormatrix_int_ldu2_end:
            tbz         x2, #2, .Lcolormatrix_int_ldu2_end_no4
            ld1         {v15.d}[1], [x1], #8
.Lcolormatrix_int_ldu2_end_no4:
            tbz         x2, #1, .Lcolormatrix_int_ldu2_end_no2
            ld1         {v15.s}[1], [x1], #4
.Lcolormatrix_int_ldu2_end_no2:
            tbz         x2, #0, .Lcolormatrix_int_ldu2_end_split
            ld1         {v15.h}[1], [x1], #2
.Lcolormatrix_int_ldu2_end_split:
            uzp1        v14.16b, v15.16b, v15.16b
            uxtl        v12.8h, v14.8b
            uzp2        v15.16b, v15.16b, v15.16b
            uxtl        v13.8h, v15.8b
            br          x4
677
/* Tail load, 3-channel u8 input stored 4 bytes per element, integer
 * pipeline.  Bits 2, 1 and 0 of x2 select whether lanes 4-7, lanes 2-3 and
 * lane 1 of v12-v15 are filled, one 4-byte element per load, from x1
 * (post-incremented).  The first three channels are then zero-extended to
 * 16 bits; v15 is left as loaded.  Continues at the routine in x4.
 */
colormatrix_int_ldu3_end:
            tbz         x2, #2, .Lcolormatrix_int_ldu3_end_no4
            ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
            ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
            ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
            ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
.Lcolormatrix_int_ldu3_end_no4:
            tbz         x2, #1, .Lcolormatrix_int_ldu3_end_no2
            ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
            ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
.Lcolormatrix_int_ldu3_end_no2:
            tbz         x2, #0, .Lcolormatrix_int_ldu3_end_widen
            ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
.Lcolormatrix_int_ldu3_end_widen:
            uxtl        v12.8h, v12.8b
            uxtl        v13.8h, v13.8b
            uxtl        v14.8h, v14.8b
            br          x4
693
/* Tail load, 4 x u8 per element, integer pipeline.  Bits 2, 1 and 0 of x2
 * select whether lanes 4-7, lanes 2-3 and lane 1 of v12-v15 are filled, one
 * 4-byte element per load, from x1 (post-incremented).  All four channels
 * are then zero-extended to 16 bits.  Continues at the routine in x4.
 */
colormatrix_int_ldu4_end:
            tbz         x2, #2, .Lcolormatrix_int_ldu4_end_no4
            ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
            ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
            ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
            ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
.Lcolormatrix_int_ldu4_end_no4:
            tbz         x2, #1, .Lcolormatrix_int_ldu4_end_no2
            ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
            ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
.Lcolormatrix_int_ldu4_end_no2:
            tbz         x2, #0, .Lcolormatrix_int_ldu4_end_widen
            ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
.Lcolormatrix_int_ldu4_end_widen:
            uxtl        v12.8h, v12.8b
            uxtl        v13.8h, v13.8b
            uxtl        v14.8h, v14.8b
            uxtl        v15.8h, v15.8b
            br          x4
710
/* Tail store, float results to 1 x u8 per element.  v8 (elements 0-3) and
 * v16 (elements 4-7) are converted to fixed point with one fractional bit,
 * rounding-shifted right by one with unsigned saturation and narrowed to
 * 8 bits in v12 (v13 is used as scratch).  Bits 2, 1 and 0 of x2 select
 * whether bytes 4-7, bytes 2-3 and byte 1 are written at x0
 * (post-incremented).  Finishes at colormatrix_float_realend.
 */
colormatrix_float_stu1_end:
            fcvtzs      v12.4s, v8.4s, #1
            sqrshrun    v12.4h, v12.4s, #1
            fcvtzs      v13.4s, v16.4s, #1
            sqrshrun2   v12.8h, v13.4s, #1
            uqxtn       v12.8b, v12.8h
            tbz         x2, #2, .Lcolormatrix_float_stu1_end_no4
            st1         {v12.s}[1], [x0], #4
.Lcolormatrix_float_stu1_end_no4:
            tbz         x2, #1, .Lcolormatrix_float_stu1_end_no2
            st1         {v12.h}[1], [x0], #2
.Lcolormatrix_float_stu1_end_no2:
            tbz         x2, #0, .Lcolormatrix_float_stu1_end_done
            st1         {v12.b}[1], [x0], #1
.Lcolormatrix_float_stu1_end_done:
            b           colormatrix_float_realend
724
/* Tail store, float results to 2 x u8 per element.  v8/v9 (elements 0-3)
 * and v16/v17 (elements 4-7) are converted to fixed point with one
 * fractional bit and rounding-shifted right by one with unsigned saturation
 * to 16 bits.  The two channels are interleaved per element and narrowed to
 * bytes in v12: elements 0-3 in the lower half, elements 4-7 in the upper
 * half.  Bits 2, 1 and 0 of x2 select whether elements 4-7, 2-3 and 1 are
 * written at x0 (post-incremented).  v13-v15 are used as scratch.  Finishes
 * at colormatrix_float_realend.
 */
colormatrix_float_stu2_end:
            /* elements 0-3: convert both channels, then interleave */
            fcvtzs      v12.4s, v8.4s, #1
            sqrshrun    v12.4h, v12.4s, #1
            fcvtzs      v13.4s, v9.4s, #1
            sqrshrun    v13.4h, v13.4s, #1
            zip1        v12.8h, v12.8h, v13.8h
            /* elements 4-7: same, result in v13 */
            fcvtzs      v14.4s, v16.4s, #1
            sqrshrun    v14.4h, v14.4s, #1
            fcvtzs      v15.4s, v17.4s, #1
            sqrshrun    v15.4h, v15.4s, #1
            zip1        v13.8h, v14.8h, v15.8h
            uqxtn       v12.8b, v12.8h
            uqxtn2      v12.16b, v13.8h
            tbz         x2, #2, .Lcolormatrix_float_stu2_end_no4
            st1         {v12.d}[1], [x0], #8
.Lcolormatrix_float_stu2_end_no4:
            tbz         x2, #1, .Lcolormatrix_float_stu2_end_no2
            st1         {v12.s}[1], [x0], #4
.Lcolormatrix_float_stu2_end_no2:
            tbz         x2, #0, .Lcolormatrix_float_stu2_end_done
            st1         {v12.h}[1], [x0], #2
.Lcolormatrix_float_stu2_end_done:
            b           colormatrix_float_realend
745
/* Tail store, float results to 3-channel u8 output stored 4 bytes per
 * element.  For each of the three channels, the element 0-3 vector (v8-v10)
 * and the element 4-7 vector (v16-v18) are converted to fixed point with one
 * fractional bit, rounding-shifted right by one with unsigned saturation,
 * and narrowed to bytes in v12-v14; v15 supplies a zero fourth byte.
 * v24-v26 and v28-v30 are used as scratch.  Bits 2, 1 and 0 of x2 select
 * whether lanes 4-7, lanes 2-3 and lane 1 are written, one 4-byte element
 * per store, at x0 (post-incremented).  Finishes at
 * colormatrix_float_realend.
 */
colormatrix_float_stu3_end:
            /* channel 0 */
            fcvtzs      v24.4s, v8.4s, #1
            fcvtzs      v28.4s, v16.4s, #1
            sqrshrun    v24.4h, v24.4s, #1
            sqrshrun2   v24.8h, v28.4s, #1
            uqxtn       v12.8b, v24.8h
            /* channel 1 */
            fcvtzs      v25.4s, v9.4s, #1
            fcvtzs      v29.4s, v17.4s, #1
            sqrshrun    v25.4h, v25.4s, #1
            sqrshrun2   v25.8h, v29.4s, #1
            uqxtn       v13.8b, v25.8h
            /* channel 2 */
            fcvtzs      v26.4s, v10.4s, #1
            fcvtzs      v30.4s, v18.4s, #1
            sqrshrun    v26.4h, v26.4s, #1
            sqrshrun2   v26.8h, v30.4s, #1
            uqxtn       v14.8b, v26.8h
            /* padding byte */
            movi        v15.8b, #0
            tbz         x2, #2, .Lcolormatrix_float_stu3_end_no4
            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
.Lcolormatrix_float_stu3_end_no4:
            tbz         x2, #1, .Lcolormatrix_float_stu3_end_no2
            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
.Lcolormatrix_float_stu3_end_no2:
            tbz         x2, #0, .Lcolormatrix_float_stu3_end_done
            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
.Lcolormatrix_float_stu3_end_done:
            b           colormatrix_float_realend
774
/* Tail store, float results to 4 x u8 per element.  For each of the four
 * channels, the element 0-3 vector (v8-v11) and the element 4-7 vector
 * (v16-v19) are converted to fixed point with one fractional bit,
 * rounding-shifted right by one with unsigned saturation, and narrowed to
 * bytes in v12-v15.  v24-v31 are used as scratch.  Bits 2, 1 and 0 of x2
 * select whether lanes 4-7, lanes 2-3 and lane 1 are written, one 4-byte
 * element per store, at x0 (post-incremented).  Finishes at
 * colormatrix_float_realend.
 */
colormatrix_float_stu4_end:
            /* channel 0 */
            fcvtzs      v24.4s, v8.4s, #1
            fcvtzs      v28.4s, v16.4s, #1
            sqrshrun    v24.4h, v24.4s, #1
            sqrshrun2   v24.8h, v28.4s, #1
            uqxtn       v12.8b, v24.8h
            /* channel 1 */
            fcvtzs      v25.4s, v9.4s, #1
            fcvtzs      v29.4s, v17.4s, #1
            sqrshrun    v25.4h, v25.4s, #1
            sqrshrun2   v25.8h, v29.4s, #1
            uqxtn       v13.8b, v25.8h
            /* channel 2 */
            fcvtzs      v26.4s, v10.4s, #1
            fcvtzs      v30.4s, v18.4s, #1
            sqrshrun    v26.4h, v26.4s, #1
            sqrshrun2   v26.8h, v30.4s, #1
            uqxtn       v14.8b, v26.8h
            /* channel 3 */
            fcvtzs      v27.4s, v11.4s, #1
            fcvtzs      v31.4s, v19.4s, #1
            sqrshrun    v27.4h, v27.4s, #1
            sqrshrun2   v27.8h, v31.4s, #1
            uqxtn       v15.8b, v27.8h
            tbz         x2, #2, .Lcolormatrix_float_stu4_end_no4
            st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
.Lcolormatrix_float_stu4_end_no4:
            tbz         x2, #1, .Lcolormatrix_float_stu4_end_no2
            st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
            st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
.Lcolormatrix_float_stu4_end_no2:
            tbz         x2, #0, .Lcolormatrix_float_stu4_end_done
            st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
.Lcolormatrix_float_stu4_end_done:
            b           colormatrix_float_realend
807
/* Partial-block store, one float per pixel.
 * In:  v8 (lanes 0-3) and v16 (lanes 4-7), x0 = destination,
 *      x2 = leftover pixel count (only bits 2..0 are read).
 * Bit 2 stores all of v16, bit 1 stores the upper half of v8, bit 0 stores
 * v8 lane 1; v8 lane 0 is never stored here.
 */
colormatrix_float_stf1_end:
            tbz         x2, #2, 1f
            st1         {v16.4s}, [x0], #16     /* four pixels */
1:          tbz         x2, #1, 1f
            st1         {v8.d}[1], [x0], #8     /* two pixels: v8 lanes 2-3 */
1:          tbz         x2, #0, 1f
            st1         {v8.s}[1], [x0], #4     /* one pixel: v8 lane 1 */
1:          b           colormatrix_float_realend
816
/* Partial-block store, two interleaved floats (8 bytes) per pixel.
 * In:  v8/v9 (lanes 0-3) and v16/v17 (lanes 4-7), x0 = destination,
 *      x2 = leftover pixel count (only bits 2..0 are read).
 * Bit 2 stores v16/v17 in full, bit 1 stores lanes 2-3 of v8/v9, bit 0
 * stores lane 1 of v8/v9; lane 0 of v8/v9 is never stored here.
 */
colormatrix_float_stf2_end:
            tbz         x2, #2, 1f
            st2         {v16.4s, v17.4s}, [x0], #32     /* four pixels */
1:          tbz         x2, #1, 1f
            st2         {v8.s,v9.s}[2], [x0], #8        /* two pixels */
            st2         {v8.s,v9.s}[3], [x0], #8
1:          tbz         x2, #0, 1f
            st2         {v8.s,v9.s}[1], [x0], #8        /* one pixel */
1:          b           colormatrix_float_realend
826
/* Partial-block store, four interleaved floats (16 bytes) per pixel.
 * In:  v8-v11 (lanes 0-3) and v16-v19 (lanes 4-7), x0 = destination,
 *      x2 = leftover pixel count (only bits 2..0 are read).
 * The stf3 entry clears the fourth-channel registers and then falls through
 * into the stf4 code, so it also writes 16 bytes per pixel with the last
 * float zero.
 */
colormatrix_float_stf3_end:
            movi        v11.16b, #0             /* zero fourth channel, lanes 0-3 */
            movi        v19.16b, #0             /* zero fourth channel, lanes 4-7 */
colormatrix_float_stf4_end:
            tbz         x2, #2, 1f
            st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64   /* four pixels */
1:          tbz         x2, #1, 1f
            st4         {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16      /* two pixels */
            st4         {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
1:          tbz         x2, #0, 1f
            st4         {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16      /* one pixel */
1:          b           colormatrix_float_realend
839
/* Partial-block load, one byte per pixel, widened to float.
 * In:  x1 = source, x2 = leftover pixel count (only bits 2..0 are read).
 * Out: v12 = lanes 0-3 and v20 = lanes 4-7 as floats; x1 advanced.
 * Bit 2 fills byte lanes 4-7 of v15, bit 1 lanes 2-3, bit 0 lane 1.  Lanes
 * that are not loaded keep whatever v15 held on entry.
 * Leaves through x4.
 */
colormatrix_float_ldu1_end:
            tbz         x2, #2, 1f
            ld1         {v15.s}[1], [x1], #4    /* four pixels -> bytes 4-7 */
1:          tbz         x2, #1, 1f
            ld1         {v15.h}[1], [x1], #2    /* two pixels -> bytes 2-3 */
1:          tbz         x2, #0, 1f
            ld1         {v15.b}[1], [x1], #1    /* one pixel -> byte 1 */
1:          uxtl        v15.8h, v15.8b          /* u8 -> u16 */
            uxtl        v12.4s, v15.4h          /* u16 -> u32, lanes 0-3 */
            uxtl2       v20.4s, v15.8h          /* u16 -> u32, lanes 4-7 */
            ucvtf       v12.4s, v12.4s          /* u32 -> float */
            ucvtf       v20.4s, v20.4s
            br          x4
853
/* Partial-block load, two interleaved bytes per pixel, widened to float.
 * In:  x1 = source, x2 = leftover pixel count (only bits 2..0 are read).
 * Out: v12/v13 = channels 0/1 for lanes 0-3, v20/v21 = channels 0/1 for
 *      lanes 4-7, all as floats; x1 advanced.
 * Bit 2 fills the upper 8 bytes of v15 (pixel lanes 4-7), bit 1 bytes 4-7
 * (lanes 2-3), bit 0 bytes 2-3 (lane 1).  Unloaded bytes keep whatever v15
 * held on entry.
 * Leaves through x4.
 */
colormatrix_float_ldu2_end:
            tbz         x2, #2, 1f
            ld1         {v15.d}[1], [x1], #8
1:          tbz         x2, #1, 1f
            ld1         {v15.s}[1], [x1], #4
1:          tbz         x2, #0, 1f
            ld1         {v15.h}[1], [x1], #2
1:          uxtl        v14.8h, v15.8b          /* u8 -> u16, pixel lanes 0-3 */
            uxtl2       v15.8h, v15.16b         /* u8 -> u16, pixel lanes 4-7 */
            /* de-interleave: even halfwords are channel 0, odd are channel 1 */
            uzp1        v12.8h, v14.8h, v14.8h
            uzp2        v13.8h, v14.8h, v14.8h
            uzp1        v20.8h, v15.8h, v15.8h
            uzp2        v21.8h, v15.8h, v15.8h
            /* u16 -> u32 (low four halfwords of each register) */
            uxtl        v12.4s, v12.4h
            uxtl        v13.4s, v13.4h
            uxtl        v20.4s, v20.4h
            uxtl        v21.4s, v21.4h
            /* u32 -> float */
            ucvtf       v12.4s, v12.4s
            ucvtf       v13.4s, v13.4s
            ucvtf       v20.4s, v20.4s
            ucvtf       v21.4s, v21.4s
            br          x4
876
/* Partial-block load, 4 bytes read per pixel, first three channels widened
 * to float.
 * In:  x1 = source, x2 = leftover pixel count (only bits 2..0 are read).
 * Out: v12-v14 = channels 0-2 for lanes 0-3, v20-v22 = channels 0-2 for
 *      lanes 4-7, as floats; x1 advanced.
 * The fourth byte of each pixel is loaded into v23 but not converted.
 * Bit 2 fills lanes 4-7, bit 1 lanes 2-3, bit 0 lane 1; unloaded lanes keep
 * whatever v20-v23 held on entry.
 * Leaves through x4.
 */
colormatrix_float_ldu3_end:
            tbz         x2, #2, 1f
            ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
            ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
            ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
            ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
1:          tbz         x2, #1, 1f
            ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
            ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
1:          tbz         x2, #0, 1f
            ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
            /* u8 -> u16 */
1:          uxtl        v20.8h, v20.8b
            uxtl        v21.8h, v21.8b
            uxtl        v22.8h, v22.8b
            /* u16 -> u32: low halves first (they read v20-v22), then the
             * high halves overwrite v20-v22 in place */
            uxtl        v12.4s, v20.4h
            uxtl        v13.4s, v21.4h
            uxtl        v14.4s, v22.4h
            uxtl2       v20.4s, v20.8h
            uxtl2       v21.4s, v21.8h
            uxtl2       v22.4s, v22.8h
            /* u32 -> float */
            ucvtf       v12.4s, v12.4s
            ucvtf       v13.4s, v13.4s
            ucvtf       v14.4s, v14.4s
            ucvtf       v20.4s, v20.4s
            ucvtf       v21.4s, v21.4s
            ucvtf       v22.4s, v22.4s
            br          x4
904
/* Partial-block load, four interleaved bytes per pixel, widened to float.
 * In:  x1 = source, x2 = leftover pixel count (only bits 2..0 are read).
 * Out: v12-v15 = channels 0-3 for lanes 0-3, v20-v23 = channels 0-3 for
 *      lanes 4-7, as floats; x1 advanced.
 * Bit 2 fills lanes 4-7, bit 1 lanes 2-3, bit 0 lane 1; unloaded lanes keep
 * whatever v20-v23 held on entry.
 * Leaves through x4.
 */
colormatrix_float_ldu4_end:
            tbz         x2, #2, 1f
            ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
            ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
            ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
            ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
1:          tbz         x2, #1, 1f
            ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
            ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
1:          tbz         x2, #0, 1f
            ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
            /* u8 -> u16 */
1:          uxtl        v20.8h, v20.8b
            uxtl        v21.8h, v21.8b
            uxtl        v22.8h, v22.8b
            uxtl        v23.8h, v23.8b
            /* u16 -> u32: low halves first (they read v20-v23), then the
             * high halves overwrite v20-v23 in place */
            uxtl        v12.4s, v20.4h
            uxtl        v13.4s, v21.4h
            uxtl        v14.4s, v22.4h
            uxtl        v15.4s, v23.4h
            uxtl2       v20.4s, v20.8h
            uxtl2       v21.4s, v21.8h
            uxtl2       v22.4s, v22.8h
            uxtl2       v23.4s, v23.8h
            /* u32 -> float */
            ucvtf       v12.4s, v12.4s
            ucvtf       v13.4s, v13.4s
            ucvtf       v14.4s, v14.4s
            ucvtf       v15.4s, v15.4s
            ucvtf       v20.4s, v20.4s
            ucvtf       v21.4s, v21.4s
            ucvtf       v22.4s, v22.4s
            ucvtf       v23.4s, v23.4s
            br          x4
937
/* Partial-block load, one float per pixel.
 * In:  x1 = source, x2 = leftover pixel count (only bits 2..0 are read).
 * Out: v20 (lanes 4-7) and/or parts of v12 (lanes 0-3); x1 advanced.
 * Bit 2 loads all of v20, bit 1 the upper half of v12, bit 0 v12 lane 1.
 * Lanes that are not loaded are left unchanged.
 * Leaves through x4.
 */
colormatrix_float_ldf1_end:
            tbz         x2, #2, 1f
            ld1         {v20.4s}, [x1], #16     /* four pixels */
1:          tbz         x2, #1, 1f
            ld1         {v12.d}[1], [x1], #8    /* two pixels: v12 lanes 2-3 */
1:          tbz         x2, #0, 1f
            ld1         {v12.s}[1], [x1], #4    /* one pixel: v12 lane 1 */
1:          br          x4
946
/* Partial-block load, two interleaved floats (8 bytes) per pixel.
 * In:  x1 = source, x2 = leftover pixel count (only bits 2..0 are read).
 * Out: v20/v21 (lanes 4-7) and/or lanes of v12/v13 (lanes 0-3), one
 *      register per channel; x1 advanced.
 * Lanes that are not loaded are left unchanged.
 * Leaves through x4.
 */
colormatrix_float_ldf2_end:
            tbz         x2, #2, 1f
            ld2         {v20.4s,v21.4s}, [x1], #32      /* four pixels */
1:          tbz         x2, #1, 1f
            ld2         {v12.s,v13.s}[2], [x1], #8      /* two pixels */
            ld2         {v12.s,v13.s}[3], [x1], #8
1:          tbz         x2, #0, 1f
            ld2         {v12.s,v13.s}[1], [x1], #8      /* one pixel */
1:          br          x4
956
/* Partial-block load, four interleaved floats (16 bytes) per pixel.  The
 * ldf3 and ldf4 entry points are the same code: both read 16 bytes per pixel.
 * In:  x1 = source, x2 = leftover pixel count (only bits 2..0 are read).
 * Out: v20-v23 (lanes 4-7) and/or lanes of v12-v15 (lanes 0-3), one
 *      register per channel; x1 advanced.
 * Lanes that are not loaded are left unchanged.
 * Leaves through x4.
 */
colormatrix_float_ldf3_end:
colormatrix_float_ldf4_end:
            tbz         x2, #2, 1f
            ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64   /* four pixels */
1:          tbz         x2, #1, 1f
            ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16    /* two pixels */
            ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
1:          tbz         x2, #0, 1f
            ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16    /* one pixel */
1:          br          x4
967
/* void rsdIntrinsicColorMatrix_int_K(
 *          void *out,              // x0
 *          void const *in,         // x1
 *          size_t count,           // x2
 *          fntab_t const *fns,     // x3
 *          int16_t const *mult,    // x4
 *          int32_t const *add);    // x5
 *
 * Entry point of the fixed-point colour matrix.  It saves the low halves of
 * v8-v15, loads the coefficients, pulls the routine pointers out of fns
 * (column0..3 -> x4..x7, store -> x8, load -> x9) and jumps to the load
 * routine.  The fns layout is the one written by
 * rsdIntrinsicColorMatrixSetup_int_K.
 */
ENTRY(rsdIntrinsicColorMatrix_int_K)
            /* Reserve 64 bytes and save d8-d15; x7 addresses the upper
             * 32 bytes of the new frame (old sp - 32 == new sp + 32). */
            sub         x7, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d-v11.1d}, [sp]
            st1         {v12.1d-v15.1d}, [x7]

            ld1         {v0.8h,v1.8h}, [x4], #32    /* 16 int16 multipliers */
            ld1         {v4.4s}, [x5], #16          /* 4 int32 addends */

            /* x4/x5 are reused from here on: routine pointers from fns */
            ldp         x4,x5, [x3],#16
            ldp         x6,x7, [x3],#16
            ldp         x8,x9, [x3],#16

            /* Broadcast each addend, then narrow with >>8 and unsigned
             * saturation so v8-v11 hold it in all eight halfword lanes. */
            dup         v12.4s, v4.s[0]
            dup         v13.4s, v4.s[1]
            dup         v14.4s, v4.s[2]
            dup         v15.4s, v4.s[3]
            sqshrun     v8.4h, v12.4s, #8
            sqshrun2    v8.8h, v12.4s, #8
            sqshrun     v9.4h, v13.4s, #8
            sqshrun2    v9.8h, v13.4s, #8
            sqshrun     v10.4h, v14.4s, #8
            sqshrun2    v10.8h, v14.4s, #8
            sqshrun     v11.4h, v15.4s, #8
            sqshrun2    v11.8h, v15.4s, #8

            /* x2 is kept biased by -8 while full blocks of eight remain */
            subs        x2, x2, #8
            blo         colormatrix_int_end
            br          x9

/* Tail: undo the -8 bias; leave if nothing is left, otherwise switch to the
 * partial-block store/load routines from the next fns pair. */
colormatrix_int_end:
            adds        x2, x2, #8
            bls         colormatrix_int_realend     /* taken when x2 == 0 */
            mov         x16, x8                     /* old full-block store */
            ldp         x8, x9, [x3], #16           /* partial store / load */
            /* any column slot that aliases the old store routine must
             * follow it to the partial-block one */
            cmp         x4, x16
            csel        x4, x8, x4, eq
            cmp         x5, x16
            csel        x5, x8, x5, eq
            cmp         x6, x16
            csel        x6, x8, x6, eq
            cmp         x7, x16
            csel        x7, x8, x7, eq
            br          x9

/* Epilogue: restore d8-d15, release the 64-byte frame and return. */
colormatrix_int_realend:
            ld1         {v8.1d-v11.1d}, [sp], #32
            ld1         {v12.1d-v15.1d}, [sp], #32
            ret
END(rsdIntrinsicColorMatrix_int_K)
1026
/* void rsdIntrinsicColorMatrixSetup_int_K(
 *          fntab_t const *fns, // x0
 *          uint32_t mask,      // x1
 *          int dt,             // x2
 *          int st);            // x3
 *
 * Fills the eight pointer slots at fns:
 *   fns[0..3]  column routines, chosen by successive 5-bit fields of mask
 *   fns[4]     store routine for dt        fns[5]  load routine for st
 *   fns[6]     partial-block store         fns[7]  partial-block load
 * The routines are found through the tables of 32-bit self-relative offsets
 * that follow this function (local labels 2, 3 and 4).
 * Only bits 19:0 of mask are consumed.
 */
ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
            /* dt and st are 32-bit ints; AAPCS64 leaves bits 63:32 of their
             * registers unspecified, so sign-extend them while scaling the
             * index instead of using x2/x3 directly. */
            adrp        x7, 2f
            add         x7, x7, :lo12:2f
            add         x4, x7, w2, sxtw #3         /* &store_table[dt] */
            ldrsw       x2, [x4], #4                /* full-block store offset */
            ldrsw       x4, [x4]                    /* partial-block store offset */
            add         x2, x2, x7
            add         x4, x4, x7
            adrp        x7, 3f
            add         x7, x7, :lo12:3f
            add         x5, x7, w3, sxtw #3         /* &load_table[st] */
            ldrsw       x3, [x5], #4                /* full-block load offset */
            ldrsw       x5, [x5]                    /* partial-block load offset */
            add         x3, x3, x7
            add         x5, x5, x7
            stp         x2, x3, [x0, #32]           /* fns[4], fns[5] */
            stp         x4, x5, [x0, #48]           /* fns[6], fns[7] */

/* For each column function, if the matrix is all zeroes then write NULL,
 * otherwise look up the appropriate function and store that. */

            mov         x3, #4
            adrp        x7, 4f
            add         x7, x7, :lo12:4f
1:          ands        x2, x1, #15
            beq         9f                          /* x2 == 0 -> store NULL */
            and         x2, x1, #31
            lsl         x2, x2, #4                  /* 16 bytes per table row */
            ldrsw       x2, [x7, x2]
            add         x2, x2, x7
9:          str         x2, [x0], #8
            lsr         x1, x1, #5                  /* next column's field */
            add         x7, x7, #4                  /* next word within each row */
            subs        x3, x3, #1
            bne         1b

/* For every NULL entry, copy the non-NULL entry that follows it, or the store
 * function. */

            ldr         x2, [x0]                    /* x0 == &fns[4]: store routine */
            mov         x3, #4
1:          ldr         x1, [x0, #-8]!              /* walk fns[3] down to fns[0] */
            cmp         x1, #0
            csel        x2, x1, x2, ne
            str         x2, [x0]
            subs        x3, x3, #1
            bne         1b
            ret

END(rsdIntrinsicColorMatrixSetup_int_K)
.rodata
            .align 4
/* Dispatch tables read by rsdIntrinsicColorMatrixSetup_int_K.  Every entry
 * is a 32-bit offset relative to the table's own label.
 *
 * 2: store routines, indexed by dt; each 8-byte entry is the full-block
 *    routine followed by its partial-block (_end) counterpart. */
2:          .word      colormatrix_int_stu1-2b
            .word      colormatrix_int_stu1_end-2b
            .word      colormatrix_int_stu2-2b
            .word      colormatrix_int_stu2_end-2b
            .word      colormatrix_int_stu3-2b
            .word      colormatrix_int_stu3_end-2b
            .word      colormatrix_int_stu4-2b
            .word      colormatrix_int_stu4_end-2b
/* 3: load routines, indexed by st; same pair layout as table 2. */
3:          .word      colormatrix_int_ldu1-3b
            .word      colormatrix_int_ldu1_end-3b
            .word      colormatrix_int_ldu2-3b
            .word      colormatrix_int_ldu2_end-3b
            .word      colormatrix_int_ldu3-3b
            .word      colormatrix_int_ldu3_end-3b
            .word      colormatrix_int_ldu4-3b
            .word      colormatrix_int_ldu4_end-3b
/* 4: column routines, 32 rows of four words (one word per column), indexed
 *    by a 5-bit mask field.  Rows 0-15 are the _0.._15 variants; rows 16-31
 *    are the _n15.._n0 variants.  The setup loop advances its base register
 *    by 4 bytes per column, so the word for column N has 4*N subtracted to
 *    cancel that. */
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
            .word      colormatrix_int_col0_\i-4b
            .word      colormatrix_int_col1_\i-4b-4
            .word      colormatrix_int_col2_\i-4b-8
            .word      colormatrix_int_col3_\i-4b-12
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
            .word      colormatrix_int_col0_n\i-4b
            .word      colormatrix_int_col1_n\i-4b-4
            .word      colormatrix_int_col2_n\i-4b-8
            .word      colormatrix_int_col3_n\i-4b-12
.endr
1114
1115
/* void rsdIntrinsicColorMatrix_float_K(
 *          void *out,              // x0
 *          void const *in,         // x1
 *          size_t count,           // x2
 *          fntab_t const *fns,     // x3
 *          float const *mult,      // x4
 *          float const *add);      // x5
 *
 * Entry point of the floating-point colour matrix.  It saves the low halves
 * of v8-v15, loads the coefficients, pulls the routine pointers out of fns
 * (column0..3 -> x4..x7, store -> x8, load -> x9) and jumps to the load
 * routine.  The fns layout is the one written by
 * rsdIntrinsicColorMatrixSetup_float_K.
 */
ENTRY(rsdIntrinsicColorMatrix_float_K)
            /* Reserve 64 bytes and save d8-d15; x7 addresses the upper
             * 32 bytes of the new frame (old sp - 32 == new sp + 32). */
            sub         x7, sp, #32
            sub         sp, sp, #64
            st1         {v8.1d-v11.1d}, [sp]
            st1         {v12.1d-v15.1d}, [x7]

            ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64   /* 16 multipliers */
            ld1r        {v4.4s}, [x5], #4           /* add[0] in every lane */
            ld1r        {v5.4s}, [x5], #4           /* add[1] */
            ld1r        {v6.4s}, [x5], #4           /* add[2] */
            ld1r        {v7.4s}, [x5], #4           /* add[3] */

            /* x4/x5 are reused from here on: routine pointers from fns */
            ldp         x4,x5, [x3], #16
            ldp         x6,x7, [x3], #16
            ldp         x8,x9, [x3], #16

            /* v8-v11 (lanes 0-3) and v16-v19 (lanes 4-7) start out as copies
             * of the broadcast addends */
            mov         v8.16b, v4.16b
            mov         v9.16b, v5.16b
            mov         v10.16b, v6.16b
            mov         v11.16b, v7.16b

            mov         v16.16b, v4.16b
            mov         v17.16b, v5.16b
            mov         v18.16b, v6.16b
            mov         v19.16b, v7.16b

            /* x2 is kept biased by -8 while full blocks of eight remain */
            subs        x2, x2, #8
            blo         colormatrix_float_end
            br          x9

/* Tail: undo the -8 bias; leave if nothing is left, otherwise switch to the
 * partial-block store/load routines from the next fns pair. */
colormatrix_float_end:
            adds        x2, x2, #8
            /* Exit through this function's own epilogue rather than the
             * identical one inside rsdIntrinsicColorMatrix_int_K. */
            bls         colormatrix_float_realend   /* taken when x2 == 0 */
            mov         x16, x8                     /* old full-block store */
            ldp         x8,x9, [x3], #16            /* partial store / load */
            /* any column slot that aliases the old store routine must
             * follow it to the partial-block one */
            cmp         x4, x16
            csel        x4, x8, x4, eq
            cmp         x5, x16
            csel        x5, x8, x5, eq
            cmp         x6, x16
            csel        x6, x8, x6, eq
            cmp         x7, x16
            csel        x7, x8, x7, eq
            br          x9

/* Epilogue: restore d8-d15, release the 64-byte frame and return. */
colormatrix_float_realend:
            ld1         {v8.1d-v11.1d}, [sp], #32
            ld1         {v12.1d-v15.1d}, [sp], #32
            ret
END(rsdIntrinsicColorMatrix_float_K)
1174
/* void rsdIntrinsicColorMatrixSetup_float_K(
 *          fntab_t const *fns, // x0
 *          uint32_t mask,      // x1
 *          int dt,             // x2
 *          int st);            // x3
 *
 * Fills the eight pointer slots at fns:
 *   fns[0..3]  column routines, chosen by successive 5-bit fields of mask
 *   fns[4]     store routine for dt        fns[5]  load routine for st
 *   fns[6]     partial-block store         fns[7]  partial-block load
 * The routines are found through the tables of 32-bit self-relative offsets
 * that follow this function (local labels 2, 3 and 4).
 * Only bits 19:0 of mask are consumed.
 */
ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
            /* dt and st are 32-bit ints; AAPCS64 leaves bits 63:32 of their
             * registers unspecified, so sign-extend them while scaling the
             * index instead of using x2/x3 directly. */
            adrp        x7, 2f
            add         x7, x7, :lo12:2f
            add         x4, x7, w2, sxtw #3         /* &store_table[dt] */
            ldrsw       x2, [x4], #4                /* full-block store offset */
            ldrsw       x4, [x4]                    /* partial-block store offset */
            add         x2, x2, x7
            add         x4, x4, x7
            adrp        x7, 3f
            add         x7, x7, :lo12:3f
            add         x5, x7, w3, sxtw #3         /* &load_table[st] */
            ldrsw       x3, [x5], #4                /* full-block load offset */
            ldrsw       x5, [x5]                    /* partial-block load offset */
            add         x3, x3, x7
            add         x5, x5, x7
            stp         x2, x3, [x0, #32]           /* fns[4], fns[5] */
            stp         x4, x5, [x0, #48]           /* fns[6], fns[7] */

/* For each column function, if the matrix is all zeroes then write NULL,
 * otherwise look up the appropriate function and store that. */

            mov         x3, #4
            adrp        x7, 4f
            add         x7, x7, :lo12:4f
1:          ands        x2, x1, #15
            beq         9f                          /* x2 == 0 -> store NULL */
            and         x2, x1, #31
            lsl         x2, x2, #4                  /* 16 bytes per table row */
            ldrsw       x2, [x7, x2]
            add         x2, x2, x7
9:          str         x2, [x0], #8
            lsr         x1, x1, #5                  /* next column's field */
            add         x7, x7, #4                  /* next word within each row */
            subs        x3, x3, #1
            bne         1b

/* For every NULL entry, copy the non-NULL entry that follows it, or the store
 * function. */

            ldr         x2, [x0]                    /* x0 == &fns[4]: store routine */
            mov         x3, #4
1:          ldr         x1, [x0, #-8]!              /* walk fns[3] down to fns[0] */
            cmp         x1, #0
            csel        x2, x1, x2, ne
            str         x2, [x0]
            subs        x3, x3, #1
            bne         1b
            ret

END(rsdIntrinsicColorMatrixSetup_float_K)
.rodata
            .align 4
/* Dispatch tables read by rsdIntrinsicColorMatrixSetup_float_K.  Every entry
 * is a 32-bit offset relative to the table's own label.
 *
 * 2: store routines, indexed by dt; each 8-byte entry is the full-block
 *    routine followed by its partial-block (_end) counterpart.  The four
 *    stu entries come first, then the four stf entries. */
2:          .word      colormatrix_float_stu1-2b
            .word      colormatrix_float_stu1_end-2b
            .word      colormatrix_float_stu2-2b
            .word      colormatrix_float_stu2_end-2b
            .word      colormatrix_float_stu3-2b
            .word      colormatrix_float_stu3_end-2b
            .word      colormatrix_float_stu4-2b
            .word      colormatrix_float_stu4_end-2b
            .word      colormatrix_float_stf1-2b
            .word      colormatrix_float_stf1_end-2b
            .word      colormatrix_float_stf2-2b
            .word      colormatrix_float_stf2_end-2b
            .word      colormatrix_float_stf3-2b
            .word      colormatrix_float_stf3_end-2b
            .word      colormatrix_float_stf4-2b
            .word      colormatrix_float_stf4_end-2b
/* 3: load routines, indexed by st; same pair layout as table 2 (four ldu
 *    entries, then four ldf entries). */
3:          .word      colormatrix_float_ldu1-3b
            .word      colormatrix_float_ldu1_end-3b
            .word      colormatrix_float_ldu2-3b
            .word      colormatrix_float_ldu2_end-3b
            .word      colormatrix_float_ldu3-3b
            .word      colormatrix_float_ldu3_end-3b
            .word      colormatrix_float_ldu4-3b
            .word      colormatrix_float_ldu4_end-3b
            .word      colormatrix_float_ldf1-3b
            .word      colormatrix_float_ldf1_end-3b
            .word      colormatrix_float_ldf2-3b
            .word      colormatrix_float_ldf2_end-3b
            .word      colormatrix_float_ldf3-3b
            .word      colormatrix_float_ldf3_end-3b
            .word      colormatrix_float_ldf4-3b
            .word      colormatrix_float_ldf4_end-3b
/* 4: column routines, 32 rows of four words (one word per column), indexed
 *    by a 5-bit mask field.  Rows 0-15 are the _0.._15 variants; rows 16-31
 *    are the _n15.._n0 variants.  The setup loop advances its base register
 *    by 4 bytes per column, so the word for column N has 4*N subtracted to
 *    cancel that. */
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
            .word      colormatrix_float_col0_\i-4b
            .word      colormatrix_float_col1_\i-4b-4
            .word      colormatrix_float_col2_\i-4b-8
            .word      colormatrix_float_col3_\i-4b-12
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
            .word      colormatrix_float_col0_n\i-4b
            .word      colormatrix_float_col1_n\i-4b-4
            .word      colormatrix_float_col2_n\i-4b-8
            .word      colormatrix_float_col3_n\i-4b-12
.endr
1278