/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f:
#define END(f) .size f, .-f;


/* Conditional float multiply(-accumulate).
 *   i     coefficient-presence mask for the column being generated
 *   mask  single bit of i that guards this term
 * Emits nothing when the guarded bit is clear.  Otherwise emits fmla when a
 * lower bit of i is also set (an earlier term already wrote opd) and fmul
 * when this is the first term.
 */
.macro vmxx_f32 i, mask, opd, opa, opb
  .if (\i) & \mask
    .if (\i) & (\mask - 1)
        fmla        \opd, \opa, \opb
    .else
        fmul        \opd, \opa, \opb
    .endif
  .endif
.endm

/* Conditional float add of the bias term.
 * Emits nothing when the guarded bit is clear.  When it is set, emits fadd if
 * any lower bit of i is set (products were written to opd), otherwise a plain
 * register copy movd <- movs.  The copy operands are passed separately
 * because the call sites give them in .16b form while the arithmetic
 * operands are in .4s form.
 */
.macro vadd_f32 i, mask, opd, opa, opb, movd, movs
  .if (\i) & \mask
    .if (\i) & (\mask - 1)
        fadd        \opd, \opa, \opb
    .else
        mov         \movd, \movs
    .endif
  .endif
.endm

/* Conditional widening 16-bit multiply(-accumulate), low half of opa.
 * Accumulates (smlal) when a lower bit of i or bit 16 is set; bit 16 marks
 * that the accumulator was pre-loaded with the bias by the caller.  Otherwise
 * starts a fresh product with smull.
 */
.macro vmxx_s16 i, mask, opd, opa, opb
  .if (\i) & \mask
    .if (\i) & (\mask - 1 + 16)
        smlal       \opd, \opa, \opb
    .else
        smull       \opd, \opa, \opb
    .endif
  .endif
.endm

/* Same as vmxx_s16 but operating on the high half of opa (smlal2/smull2). */
.macro vmxx2_s16 i, mask, opd, opa, opb
  .if (\i) & \mask
    .if (\i) & (\mask - 1 + 16)
        smlal2      \opd, \opa, \opb
    .else
        smull2      \opd, \opa, \opb
    .endif
  .endif
.endm

/* Register roles shared by every stage in this file:
 * x0 = dst
 * x1 = src
 * x2 = count
 * x3 = params
 * x4 = column0_fn
 * x5 = column1_fn
 * x6 = column2_fn
 * x7 = column3_fn
 * x8 = store_fn
 * x9 = load_fn
 *
 * Stages are chained with computed branches: a load stage ends in br x4, the
 * stage for column N ends with a branch to the next register in the list
 * above, and a store stage ends in br x9 (or branches to the *_end label when
 * the count runs out).
 *
 * The .irp below instantiates one stage per output column and per mask value.
 * The plain labels are generated for i = 0..15; the _n labels are generated
 * from the same body evaluated with i^31, i.e. with mask values 31..16.
 */
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

/* Integer columns: inputs v12-v15 (.8h), coefficients v0/v1, bias v4.
 * Products accumulate in v6 (low half) and v7 (high half) and are narrowed
 * with a saturating shift right by 8 into the column result v8..v11.
 */
.align 6
colormatrix_int_col0_\i:
  .if \i & 16
    dup         v6.4s, v4.s[0]
    dup         v7.4s, v4.s[0]
  .endif
    vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[0]
    vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[4]
    vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[0]
    vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[4]
    vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[0]
    vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[4]
    vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[0]
    vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[4]
    sqshrun     v8.4h, v6.4s, #8
    sqshrun2    v8.8h, v7.4s, #8
    br          x5

colormatrix_int_col0_n\i:
  .if (\i^31) & 16
    dup         v6.4s, v4.s[0]
    dup         v7.4s, v4.s[0]
  .endif
    vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[0]
    vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[4]
    vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[0]
    vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[4]
    vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[0]
    vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[4]
    vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[0]
    vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[4]
    sqshrun     v8.4h, v6.4s, #8
    sqshrun2    v8.8h, v7.4s, #8
    br          x5

.align 6
colormatrix_int_col1_\i:
  .if \i & 16
    dup         v6.4s, v4.s[1]
    dup         v7.4s, v4.s[1]
  .endif
    vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[1]
    vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[5]
    vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[1]
    vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[5]
    vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[1]
    vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[5]
    vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[1]
    vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[5]
    sqshrun     v9.4h, v6.4s, #8
    sqshrun2    v9.8h, v7.4s, #8
    br          x6

colormatrix_int_col1_n\i:
  .if (\i^31) & 16
    dup         v6.4s, v4.s[1]
    dup         v7.4s, v4.s[1]
  .endif
    vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[1]
    vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[5]
    vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[1]
    vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[5]
    vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[1]
    vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[5]
    vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[1]
    vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[5]
    sqshrun     v9.4h, v6.4s, #8
    sqshrun2    v9.8h, v7.4s, #8
    br          x6

.align 6
colormatrix_int_col2_\i:
  .if \i & 16
    dup         v6.4s, v4.s[2]
    dup         v7.4s, v4.s[2]
  .endif
    vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[2]
    vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[6]
    vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[2]
    vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[6]
    vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[2]
    vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[6]
    vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[2]
    vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[6]
    sqshrun     v10.4h, v6.4s, #8
    sqshrun2    v10.8h, v7.4s, #8
    br          x7

colormatrix_int_col2_n\i:
  .if (\i^31) & 16
    dup         v6.4s, v4.s[2]
    dup         v7.4s, v4.s[2]
  .endif
    vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[2]
    vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[6]
    vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[2]
    vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[6]
    vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[2]
    vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[6]
    vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[2]
    vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[6]
    sqshrun     v10.4h, v6.4s, #8
    sqshrun2    v10.8h, v7.4s, #8
    br          x7

.align 6
colormatrix_int_col3_\i:
  .if \i & 16
    dup         v6.4s, v4.s[3]
    dup         v7.4s, v4.s[3]
  .endif
    vmxx_s16    \i, 1, v6.4s, v12.4h, v0.h[3]
    vmxx_s16    \i, 2, v6.4s, v13.4h, v0.h[7]
    vmxx_s16    \i, 4, v6.4s, v14.4h, v1.h[3]
    vmxx_s16    \i, 8, v6.4s, v15.4h, v1.h[7]
    vmxx2_s16   \i, 1, v7.4s, v12.8h, v0.h[3]
    vmxx2_s16   \i, 2, v7.4s, v13.8h, v0.h[7]
    vmxx2_s16   \i, 4, v7.4s, v14.8h, v1.h[3]
    vmxx2_s16   \i, 8, v7.4s, v15.8h, v1.h[7]
    sqshrun     v11.4h, v6.4s, #8
    sqshrun2    v11.8h, v7.4s, #8
    br          x8

colormatrix_int_col3_n\i:
  .if (\i^31) & 16
    dup         v6.4s, v4.s[3]
    dup         v7.4s, v4.s[3]
  .endif
    vmxx_s16    \i^31, 1, v6.4s, v12.4h, v0.h[3]
    vmxx_s16    \i^31, 2, v6.4s, v13.4h, v0.h[7]
    vmxx_s16    \i^31, 4, v6.4s, v14.4h, v1.h[3]
    vmxx_s16    \i^31, 8, v6.4s, v15.4h, v1.h[7]
    vmxx2_s16   \i^31, 1, v7.4s, v12.8h, v0.h[3]
    vmxx2_s16   \i^31, 2, v7.4s, v13.8h, v0.h[7]
    vmxx2_s16   \i^31, 4, v7.4s, v14.8h, v1.h[3]
    vmxx2_s16   \i^31, 8, v7.4s, v15.8h, v1.h[7]
    sqshrun     v11.4h, v6.4s, #8
    sqshrun2    v11.8h, v7.4s, #8
    br          x8

/* Float columns: inputs v12-v15 (first four pixels) and v20-v23 (last four),
 * coefficients v0-v3, per-column bias v4..v7.  Results go to v8..v11 and
 * v16..v19 respectively.
 */
.align 5
colormatrix_float_col0_\i:
    vmxx_f32    \i, 1,  v8.4s, v12.4s, v0.s[0]
    vmxx_f32    \i, 2,  v8.4s, v13.4s, v1.s[0]
    vmxx_f32    \i, 4,  v8.4s, v14.4s, v2.s[0]
    vmxx_f32    \i, 8,  v8.4s, v15.4s, v3.s[0]
    vadd_f32    \i, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
    vmxx_f32    \i, 1,  v16.4s, v20.4s, v0.s[0]
    vmxx_f32    \i, 2,  v16.4s, v21.4s, v1.s[0]
    vmxx_f32    \i, 4,  v16.4s, v22.4s, v2.s[0]
    vmxx_f32    \i, 8,  v16.4s, v23.4s, v3.s[0]
    vadd_f32    \i, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
    br          x5

.align 4
colormatrix_float_col0_n\i:
    vmxx_f32    \i^31, 1,  v8.4s, v12.4s, v0.s[0]
    vmxx_f32    \i^31, 2,  v8.4s, v13.4s, v1.s[0]
    vmxx_f32    \i^31, 4,  v8.4s, v14.4s, v2.s[0]
    vmxx_f32    \i^31, 8,  v8.4s, v15.4s, v3.s[0]
    vadd_f32    \i^31, 16, v8.4s, v8.4s, v4.4s, v8.16b, v4.16b
    vmxx_f32    \i^31, 1,  v16.4s, v20.4s, v0.s[0]
    vmxx_f32    \i^31, 2,  v16.4s, v21.4s, v1.s[0]
    vmxx_f32    \i^31, 4,  v16.4s, v22.4s, v2.s[0]
    vmxx_f32    \i^31, 8,  v16.4s, v23.4s, v3.s[0]
    vadd_f32    \i^31, 16, v16.4s, v16.4s, v4.4s, v16.16b, v4.16b
    br          x5

.align 5
colormatrix_float_col1_\i:
    vmxx_f32    \i, 1,  v9.4s, v12.4s, v0.s[1]
    vmxx_f32    \i, 2,  v9.4s, v13.4s, v1.s[1]
    vmxx_f32    \i, 4,  v9.4s, v14.4s, v2.s[1]
    vmxx_f32    \i, 8,  v9.4s, v15.4s, v3.s[1]
    vadd_f32    \i, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
    vmxx_f32    \i, 1,  v17.4s, v20.4s, v0.s[1]
    vmxx_f32    \i, 2,  v17.4s, v21.4s, v1.s[1]
    vmxx_f32    \i, 4,  v17.4s, v22.4s, v2.s[1]
    vmxx_f32    \i, 8,  v17.4s, v23.4s, v3.s[1]
    vadd_f32    \i, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
    br          x6

.align 4
colormatrix_float_col1_n\i:
    vmxx_f32    \i^31, 1,  v9.4s, v12.4s, v0.s[1]
    vmxx_f32    \i^31, 2,  v9.4s, v13.4s, v1.s[1]
    vmxx_f32    \i^31, 4,  v9.4s, v14.4s, v2.s[1]
    vmxx_f32    \i^31, 8,  v9.4s, v15.4s, v3.s[1]
    vadd_f32    \i^31, 16, v9.4s, v9.4s, v5.4s, v9.16b, v5.16b
    vmxx_f32    \i^31, 1,  v17.4s, v20.4s, v0.s[1]
    vmxx_f32    \i^31, 2,  v17.4s, v21.4s, v1.s[1]
    vmxx_f32    \i^31, 4,  v17.4s, v22.4s, v2.s[1]
    vmxx_f32    \i^31, 8,  v17.4s, v23.4s, v3.s[1]
    vadd_f32    \i^31, 16, v17.4s, v17.4s, v5.4s, v17.16b, v5.16b
    br          x6

.align 5
colormatrix_float_col2_\i:
    vmxx_f32    \i, 1,  v10.4s, v12.4s, v0.s[2]
    vmxx_f32    \i, 2,  v10.4s, v13.4s, v1.s[2]
    vmxx_f32    \i, 4,  v10.4s, v14.4s, v2.s[2]
    vmxx_f32    \i, 8,  v10.4s, v15.4s, v3.s[2]
    vadd_f32    \i, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
    vmxx_f32    \i, 1,  v18.4s, v20.4s, v0.s[2]
    vmxx_f32    \i, 2,  v18.4s, v21.4s, v1.s[2]
    vmxx_f32    \i, 4,  v18.4s, v22.4s, v2.s[2]
    vmxx_f32    \i, 8,  v18.4s, v23.4s, v3.s[2]
    vadd_f32    \i, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
    br          x7

.align 4
colormatrix_float_col2_n\i:
    vmxx_f32    \i^31, 1,  v10.4s, v12.4s, v0.s[2]
    vmxx_f32    \i^31, 2,  v10.4s, v13.4s, v1.s[2]
    vmxx_f32    \i^31, 4,  v10.4s, v14.4s, v2.s[2]
    vmxx_f32    \i^31, 8,  v10.4s, v15.4s, v3.s[2]
    vadd_f32    \i^31, 16, v10.4s, v10.4s, v6.4s, v10.16b, v6.16b
    vmxx_f32    \i^31, 1,  v18.4s, v20.4s, v0.s[2]
    vmxx_f32    \i^31, 2,  v18.4s, v21.4s, v1.s[2]
    vmxx_f32    \i^31, 4,  v18.4s, v22.4s, v2.s[2]
    vmxx_f32    \i^31, 8,  v18.4s, v23.4s, v3.s[2]
    vadd_f32    \i^31, 16, v18.4s, v18.4s, v6.4s, v18.16b, v6.16b
    br          x7

.align 5
colormatrix_float_col3_\i:
    vmxx_f32    \i, 1,  v11.4s, v12.4s, v0.s[3]
    vmxx_f32    \i, 2,  v11.4s, v13.4s, v1.s[3]
    vmxx_f32    \i, 4,  v11.4s, v14.4s, v2.s[3]
    vmxx_f32    \i, 8,  v11.4s, v15.4s, v3.s[3]
    vadd_f32    \i, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
    vmxx_f32    \i, 1,  v19.4s, v20.4s, v0.s[3]
    vmxx_f32    \i, 2,  v19.4s, v21.4s, v1.s[3]
    vmxx_f32    \i, 4,  v19.4s, v22.4s, v2.s[3]
    vmxx_f32    \i, 8,  v19.4s, v23.4s, v3.s[3]
    vadd_f32    \i, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
    br          x8

.align 4
colormatrix_float_col3_n\i:
    vmxx_f32    \i^31, 1,  v11.4s, v12.4s, v0.s[3]
    vmxx_f32    \i^31, 2,  v11.4s, v13.4s, v1.s[3]
    vmxx_f32    \i^31, 4,  v11.4s, v14.4s, v2.s[3]
    vmxx_f32    \i^31, 8,  v11.4s, v15.4s, v3.s[3]
    vadd_f32    \i^31, 16, v11.4s, v11.4s, v7.4s, v11.16b, v7.16b
    vmxx_f32    \i^31, 1,  v19.4s, v20.4s, v0.s[3]
    vmxx_f32    \i^31, 2,  v19.4s, v21.4s, v1.s[3]
    vmxx_f32    \i^31, 4,  v19.4s, v22.4s, v2.s[3]
    vmxx_f32    \i^31, 8,  v19.4s, v23.4s, v3.s[3]
    vadd_f32    \i^31, 16, v19.4s, v19.4s, v7.4s, v19.16b, v7.16b
    br          x8

.endr

/* Full-width load and store stages.  Each handles eight pixels.
 *   ldu<n> / stu<n>  n-channel 8-bit data (u8)
 *   ldf<n> / stf<n>  n-channel 32-bit float data
 * Load stages post-increment x1 and continue at x4.  Store stages
 * post-increment x0, subtract 8 from x2 and either loop back through x9 or,
 * when the subtraction borrows, branch to the matching *_end label.
 */

/* Load 8 x 4 bytes de-interleaved, widen to 32 bits and convert to float:
 * pixels 0-3 in v12-v15, pixels 4-7 in v20-v23. */
.align 6
colormatrix_float_ldu4:
    ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
    uxtl        v20.8h, v20.8b
    uxtl        v21.8h, v21.8b
    uxtl        v22.8h, v22.8b
    uxtl        v23.8h, v23.8b
    uxtl        v12.4s, v20.4h
    uxtl        v13.4s, v21.4h
    uxtl        v14.4s, v22.4h
    uxtl        v15.4s, v23.4h
    uxtl2       v20.4s, v20.8h
    uxtl2       v21.4s, v21.8h
    uxtl2       v22.4s, v22.8h
    uxtl2       v23.4s, v23.8h
    ucvtf       v12.4s, v12.4s
    ucvtf       v13.4s, v13.4s
    ucvtf       v14.4s, v14.4s
    ucvtf       v15.4s, v15.4s
    ucvtf       v20.4s, v20.4s
    ucvtf       v21.4s, v21.4s
    ucvtf       v22.4s, v22.4s
    ucvtf       v23.4s, v23.4s
    br          x4

/* Load 8 x 4 bytes de-interleaved and widen to 16 bits in v12-v15. */
.align 5
colormatrix_int_ldu4:
    ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
    uxtl        v12.8h, v12.8b
    uxtl        v13.8h, v13.8b
    uxtl        v14.8h, v14.8b
    uxtl        v15.8h, v15.8b
    br          x4

/* As ldu4, but the fourth channel is loaded and then left unconverted
 * (the source stride is still four bytes per pixel). */
.align 6
colormatrix_float_ldu3:
    ld4         {v20.8b,v21.8b,v22.8b,v23.8b}, [x1], #32
    uxtl        v20.8h, v20.8b
    uxtl        v21.8h, v21.8b
    uxtl        v22.8h, v22.8b
    uxtl        v12.4s, v20.4h
    uxtl        v13.4s, v21.4h
    uxtl        v14.4s, v22.4h
    uxtl2       v20.4s, v20.8h
    uxtl2       v21.4s, v21.8h
    uxtl2       v22.4s, v22.8h
    ucvtf       v12.4s, v12.4s
    ucvtf       v13.4s, v13.4s
    ucvtf       v14.4s, v14.4s
    ucvtf       v20.4s, v20.4s
    ucvtf       v21.4s, v21.4s
    ucvtf       v22.4s, v22.4s
    br          x4

colormatrix_int_ldu3:
    ld4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x1], #32
    uxtl        v12.8h, v12.8b
    uxtl        v13.8h, v13.8b
    uxtl        v14.8h, v14.8b
    br          x4

.align 5
colormatrix_float_ldu1:
    ld1         {v20.8b}, [x1], #8
    uxtl        v20.8h, v20.8b
    uxtl        v12.4s, v20.4h
    uxtl2       v20.4s, v20.8h
    ucvtf       v12.4s, v12.4s
    ucvtf       v20.4s, v20.4s
    br          x4

.align 6
colormatrix_float_ldu2:
    ld2         {v20.8b,v21.8b}, [x1], #16
    uxtl        v20.8h, v20.8b
    uxtl        v21.8h, v21.8b
    uxtl        v12.4s, v20.4h
    uxtl        v13.4s, v21.4h
    uxtl2       v20.4s, v20.8h
    uxtl2       v21.4s, v21.8h
    ucvtf       v12.4s, v12.4s
    ucvtf       v13.4s, v13.4s
    ucvtf       v20.4s, v20.4s
    ucvtf       v21.4s, v21.4s
    br          x4

.align 4
colormatrix_int_ldu2:
    ld2         {v12.8b,v13.8b}, [x1], #16
    uxtl        v12.8h, v12.8b
    uxtl        v13.8h, v13.8b
    br          x4

/* Float results to u8: convert to fixed point with one fractional bit, then
 * narrow with a rounding saturating shift by that one bit, then saturate
 * to bytes and store interleaved. */
.align 6
colormatrix_float_stu4:
    fcvtzs      v24.4s, v8.4s, #1
    fcvtzs      v25.4s, v9.4s, #1
    fcvtzs      v26.4s, v10.4s, #1
    fcvtzs      v27.4s, v11.4s, #1
    fcvtzs      v28.4s, v16.4s, #1
    fcvtzs      v29.4s, v17.4s, #1
    fcvtzs      v30.4s, v18.4s, #1
    fcvtzs      v31.4s, v19.4s, #1
    sqrshrun    v24.4h, v24.4s, #1
    sqrshrun    v25.4h, v25.4s, #1
    sqrshrun    v26.4h, v26.4s, #1
    sqrshrun    v27.4h, v27.4s, #1
    sqrshrun2   v24.8h, v28.4s, #1
    sqrshrun2   v25.8h, v29.4s, #1
    sqrshrun2   v26.8h, v30.4s, #1
    sqrshrun2   v27.8h, v31.4s, #1
    uqxtn       v24.8b, v24.8h
    uqxtn       v25.8b, v25.8h
    uqxtn       v26.8b, v26.8h
    uqxtn       v27.8b, v27.8h
    subs        x2, x2, #8
    st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
    blo         colormatrix_float_end
    br          x9

.align 5
colormatrix_int_stu4:
    uqxtn       v12.8b, v8.8h
    uqxtn       v13.8b, v9.8h
    uqxtn       v14.8b, v10.8h
    uqxtn       v15.8b, v11.8h
    subs        x2, x2, #8
    st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
    blo         colormatrix_int_end
    br          x9

/* Three-channel stores write a zero byte in the fourth channel. */
.align 6
colormatrix_float_stu3:
    fcvtzs      v24.4s, v8.4s, #1
    fcvtzs      v25.4s, v9.4s, #1
    fcvtzs      v26.4s, v10.4s, #1
    fcvtzs      v28.4s, v16.4s, #1
    fcvtzs      v29.4s, v17.4s, #1
    fcvtzs      v30.4s, v18.4s, #1
    sqrshrun    v24.4h, v24.4s, #1
    sqrshrun    v25.4h, v25.4s, #1
    sqrshrun    v26.4h, v26.4s, #1
    sqrshrun2   v24.8h, v28.4s, #1
    sqrshrun2   v25.8h, v29.4s, #1
    sqrshrun2   v26.8h, v30.4s, #1
    uqxtn       v24.8b, v24.8h
    uqxtn       v25.8b, v25.8h
    uqxtn       v26.8b, v26.8h
    movi        v27.8b, #0
    subs        x2, x2, #8
    st4         {v24.8b,v25.8b,v26.8b,v27.8b}, [x0], #32
    blo         colormatrix_float_end
    br          x9

.align 4
colormatrix_int_ldu1:
    ld1         {v12.8b}, [x1], #8
    uxtl        v12.8h, v12.8b
    br          x4

.align 5
colormatrix_int_stu3:
    uqxtn       v12.8b, v8.8h
    uqxtn       v13.8b, v9.8h
    uqxtn       v14.8b, v10.8h
    movi        v15.8b, #0
    subs        x2, x2, #8
    st4         {v12.8b,v13.8b,v14.8b,v15.8b}, [x0], #32
    blo         colormatrix_int_end
    br          x9

.align 6
colormatrix_float_stu2:
    fcvtzs      v24.4s, v8.4s, #1
    fcvtzs      v25.4s, v9.4s, #1
    fcvtzs      v28.4s, v16.4s, #1
    fcvtzs      v29.4s, v17.4s, #1
    sqrshrun    v24.4h, v24.4s, #1
    sqrshrun    v25.4h, v25.4s, #1
    sqrshrun2   v24.8h, v28.4s, #1
    sqrshrun2   v25.8h, v29.4s, #1
    uqxtn       v24.8b, v24.8h
    uqxtn       v25.8b, v25.8h
    subs        x2, x2, #8
    st2         {v24.8b,v25.8b}, [x0], #16
    blo         colormatrix_float_end
    br          x9

.align 5
colormatrix_int_stu2:
    uqxtn       v12.8b, v8.8h
    uqxtn       v13.8b, v9.8h
    subs        x2, x2, #8
    st2         {v12.8b,v13.8b}, [x0], #16
    blo         colormatrix_int_end
    br          x9

.align 5
colormatrix_int_stu1:
    uqxtn       v12.8b, v8.8h
    subs        x2, x2, #8
    st1         {v12.8b}, [x0], #8
    blo         colormatrix_int_end
    br          x9

/* Float loads need no conversion; ldf3 uses the same four-lane load as ldf4. */
colormatrix_float_ldf3:
    ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
    ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
    br          x4

.align 6
colormatrix_float_stu1:
    fcvtzs      v24.4s, v8.4s, #1
    fcvtzs      v28.4s, v16.4s, #1
    sqrshrun    v24.4h, v24.4s, #1
    sqrshrun2   v24.8h, v28.4s, #1
    uqxtn       v24.8b, v24.8h
    subs        x2, x2, #8
    st1         {v24.8b}, [x0], #8
    blo         colormatrix_float_end
    br          x9

/* stf3 zeroes the fourth channel (v11, v19) before the four-lane store. */
colormatrix_float_stf3:
    movi        v11.16b, #0
    st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
    movi        v19.16b, #0
    subs        x2, x2, #8
    st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
    blo         colormatrix_float_end
    br          x9

.align 5
colormatrix_float_stf4:
    st4         {v8.4s,v9.4s,v10.4s,v11.4s}, [x0], #64
    subs        x2, x2, #8
    st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
    blo         colormatrix_float_end
    br          x9

colormatrix_float_ldf4:
    ld4         {v12.4s,v13.4s,v14.4s,v15.4s}, [x1], #64
    ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
    br          x4

.align 5
colormatrix_float_stf2:
    st2         {v8.4s, v9.4s}, [x0], #32
    subs        x2, x2, #8
    st2         {v16.4s, v17.4s}, [x0], #32
    blo         colormatrix_float_end
    br          x9

colormatrix_float_ldf2:
    ld2         {v12.4s,v13.4s}, [x1], #32
    ld2         {v20.4s,v21.4s}, [x1], #32
    br          x4

.align 5
colormatrix_float_stf1:
    st1         {v8.4s}, [x0], #16
    subs        x2, x2, #8
    st1         {v16.4s}, [x0], #16
    blo         colormatrix_float_end
    br          x9

colormatrix_float_ldf1:
    ld1         {v12.4s}, [x1], #16
    ld1         {v20.4s}, [x1], #16
    br          x4

/* Tail stages for the integer path.  They transfer x2 & 7 pixels: bit 2 of
 * x2 selects a group of four, bit 1 a group of two and bit 0 a single pixel,
 * in that memory order.  Groups occupy fixed lanes (4-7, 2-3 and 1) so that
 * the tail loads and tail stores agree on where each pixel lives.
 * Tail stores finish at colormatrix_int_realend; tail loads continue at x4.
 */
colormatrix_int_stu1_end:
    uqxtn       v12.8b, v8.8h
    tbz         x2, #2, 1f
    st1         {v12.s}[1], [x0], #4
1:  tbz         x2, #1, 1f
    st1         {v12.h}[1], [x0], #2
1:  tbz         x2, #0, 1f
    st1         {v12.b}[1], [x0], #1
1:  b           colormatrix_int_realend

colormatrix_int_stu2_end:
    uqxtn       v12.8b, v8.8h
    uqxtn       v13.8b, v9.8h
    zip1        v12.16b, v12.16b, v13.16b   /* re-interleave the two channels */
    tbz         x2, #2, 1f
    st1         {v12.d}[1], [x0], #8
1:  tbz         x2, #1, 1f
    st1         {v12.s}[1], [x0], #4
1:  tbz         x2, #0, 1f
    st1         {v12.h}[1], [x0], #2
1:  b           colormatrix_int_realend

colormatrix_int_stu3_end:
    uqxtn       v12.8b, v8.8h
    uqxtn       v13.8b, v9.8h
    uqxtn       v14.8b, v10.8h
    movi        v15.8b, #0
    tbz         x2, #2, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1:  tbz         x2, #1, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1:  tbz         x2, #0, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1:  b           colormatrix_int_realend

colormatrix_int_stu4_end:
    uqxtn       v12.8b, v8.8h
    uqxtn       v13.8b, v9.8h
    uqxtn       v14.8b, v10.8h
    uqxtn       v15.8b, v11.8h
    tbz         x2, #2, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1:  tbz         x2, #1, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1:  tbz         x2, #0, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1:  b           colormatrix_int_realend


/* The one-channel tail load gathers into the upper eight bytes of v15 and
 * widens them with uxtl2, so byte 8+k becomes pixel lane k. */
colormatrix_int_ldu1_end:
    tbz         x2, #2, 1f
    ld1         {v15.s}[3], [x1], #4
1:  tbz         x2, #1, 1f
    ld1         {v15.h}[5], [x1], #2
1:  tbz         x2, #0, 1f
    ld1         {v15.b}[9], [x1], #1
1:  uxtl2       v12.8h, v15.16b
    br          x4

colormatrix_int_ldu2_end:
    tbz         x2, #2, 1f
    ld1         {v15.d}[1], [x1], #8
1:  tbz         x2, #1, 1f
    ld1         {v15.s}[1], [x1], #4
1:  tbz         x2, #0, 1f
    ld1         {v15.h}[1], [x1], #2
1:  uzp1        v14.16b, v15.16b, v15.16b   /* even bytes: channel 0 */
    uzp2        v15.16b, v15.16b, v15.16b   /* odd bytes: channel 1 */
    uxtl        v12.8h, v14.8b
    uxtl        v13.8h, v15.8b
    br          x4

colormatrix_int_ldu3_end:
    tbz         x2, #2, 1f
    ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
    ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
    ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
    ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
1:  tbz         x2, #1, 1f
    ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
    ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
1:  tbz         x2, #0, 1f
    ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
1:  uxtl        v12.8h, v12.8b
    uxtl        v13.8h, v13.8b
    uxtl        v14.8h, v14.8b
    br          x4

colormatrix_int_ldu4_end:
    tbz         x2, #2, 1f
    ld4         {v12.b,v13.b,v14.b,v15.b}[4], [x1], #4
    ld4         {v12.b,v13.b,v14.b,v15.b}[5], [x1], #4
    ld4         {v12.b,v13.b,v14.b,v15.b}[6], [x1], #4
    ld4         {v12.b,v13.b,v14.b,v15.b}[7], [x1], #4
1:  tbz         x2, #1, 1f
    ld4         {v12.b,v13.b,v14.b,v15.b}[2], [x1], #4
    ld4         {v12.b,v13.b,v14.b,v15.b}[3], [x1], #4
1:  tbz         x2, #0, 1f
    ld4         {v12.b,v13.b,v14.b,v15.b}[1], [x1], #4
1:  uxtl        v12.8h, v12.8b
    uxtl        v13.8h, v13.8b
    uxtl        v14.8h, v14.8b
    uxtl        v15.8h, v15.8b
    br          x4

/* Tail stages for the float path.  They transfer x2 & 7 pixels: bit 2 of x2
 * selects a group of four, bit 1 a group of two and bit 0 a single pixel, in
 * that memory order.  Tail stores finish at colormatrix_float_realend; tail
 * loads continue at x4.
 */
colormatrix_float_stu1_end:
    fcvtzs      v12.4s, v8.4s, #1
    fcvtzs      v13.4s, v16.4s, #1
    sqrshrun    v12.4h, v12.4s, #1
    sqrshrun2   v12.8h, v13.4s, #1
    uqxtn       v12.8b, v12.8h
    tbz         x2, #2, 1f
    st1         {v12.s}[1], [x0], #4
1:  tbz         x2, #1, 1f
    st1         {v12.h}[1], [x0], #2
1:  tbz         x2, #0, 1f
    st1         {v12.b}[1], [x0], #1
1:  b           colormatrix_float_realend

colormatrix_float_stu2_end:
    fcvtzs      v12.4s, v8.4s, #1
    fcvtzs      v13.4s, v9.4s, #1
    fcvtzs      v14.4s, v16.4s, #1
    fcvtzs      v15.4s, v17.4s, #1
    sqrshrun    v12.4h, v12.4s, #1
    sqrshrun    v13.4h, v13.4s, #1
    sqrshrun    v14.4h, v14.4s, #1
    sqrshrun    v15.4h, v15.4s, #1
    zip1        v12.8h, v12.8h, v13.8h      /* interleave channels, pixels 0-3 */
    zip1        v13.8h, v14.8h, v15.8h      /* interleave channels, pixels 4-7 */
    uqxtn       v12.8b, v12.8h
    uqxtn2      v12.16b, v13.8h
    tbz         x2, #2, 1f
    st1         {v12.d}[1], [x0], #8
1:  tbz         x2, #1, 1f
    st1         {v12.s}[1], [x0], #4
1:  tbz         x2, #0, 1f
    st1         {v12.h}[1], [x0], #2
1:  b           colormatrix_float_realend

colormatrix_float_stu3_end:
    fcvtzs      v24.4s, v8.4s, #1
    fcvtzs      v25.4s, v9.4s, #1
    fcvtzs      v26.4s, v10.4s, #1
    fcvtzs      v28.4s, v16.4s, #1
    fcvtzs      v29.4s, v17.4s, #1
    fcvtzs      v30.4s, v18.4s, #1
    sqrshrun    v24.4h, v24.4s, #1
    sqrshrun    v25.4h, v25.4s, #1
    sqrshrun    v26.4h, v26.4s, #1
    sqrshrun2   v24.8h, v28.4s, #1
    sqrshrun2   v25.8h, v29.4s, #1
    sqrshrun2   v26.8h, v30.4s, #1
    uqxtn       v12.8b, v24.8h
    uqxtn       v13.8b, v25.8h
    uqxtn       v14.8b, v26.8h
    movi        v15.8b, #0
    tbz         x2, #2, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1:  tbz         x2, #1, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1:  tbz         x2, #0, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1:  b           colormatrix_float_realend

colormatrix_float_stu4_end:
    fcvtzs      v24.4s, v8.4s, #1
    fcvtzs      v25.4s, v9.4s, #1
    fcvtzs      v26.4s, v10.4s, #1
    fcvtzs      v27.4s, v11.4s, #1
    fcvtzs      v28.4s, v16.4s, #1
    fcvtzs      v29.4s, v17.4s, #1
    fcvtzs      v30.4s, v18.4s, #1
    fcvtzs      v31.4s, v19.4s, #1
    sqrshrun    v24.4h, v24.4s, #1
    sqrshrun    v25.4h, v25.4s, #1
    sqrshrun    v26.4h, v26.4s, #1
    sqrshrun    v27.4h, v27.4s, #1
    sqrshrun2   v24.8h, v28.4s, #1
    sqrshrun2   v25.8h, v29.4s, #1
    sqrshrun2   v26.8h, v30.4s, #1
    sqrshrun2   v27.8h, v31.4s, #1
    uqxtn       v12.8b, v24.8h
    uqxtn       v13.8b, v25.8h
    uqxtn       v14.8b, v26.8h
    uqxtn       v15.8b, v27.8h
    tbz         x2, #2, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[4], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[5], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[6], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[7], [x0], #4
1:  tbz         x2, #1, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[2], [x0], #4
    st4         {v12.b,v13.b,v14.b,v15.b}[3], [x0], #4
1:  tbz         x2, #0, 1f
    st4         {v12.b,v13.b,v14.b,v15.b}[1], [x0], #4
1:  b           colormatrix_float_realend

/* Float tail stores: the group of four comes from the second result
 * register set (v16-v19), the groups of two and one from lanes 2-3 and 1 of
 * the first set (v8-v11). */
colormatrix_float_stf1_end:
    tbz         x2, #2, 1f
    st1         {v16.4s}, [x0], #16
1:  tbz         x2, #1, 1f
    st1         {v8.d}[1], [x0], #8
1:  tbz         x2, #0, 1f
    st1         {v8.s}[1], [x0], #4
1:  b           colormatrix_float_realend

colormatrix_float_stf2_end:
    tbz         x2, #2, 1f
    st2         {v16.4s, v17.4s}, [x0], #32
1:  tbz         x2, #1, 1f
    st2         {v8.s,v9.s}[2], [x0], #8
    st2         {v8.s,v9.s}[3], [x0], #8
1:  tbz         x2, #0, 1f
    st2         {v8.s,v9.s}[1], [x0], #8
1:  b           colormatrix_float_realend

/* stf3_end zeroes the fourth channel and falls through into stf4_end. */
colormatrix_float_stf3_end:
    movi        v11.16b, #0
    movi        v19.16b, #0
colormatrix_float_stf4_end:
    tbz         x2, #2, 1f
    st4         {v16.4s,v17.4s,v18.4s,v19.4s}, [x0], #64
1:  tbz         x2, #1, 1f
    st4         {v8.s,v9.s,v10.s,v11.s}[2], [x0], #16
    st4         {v8.s,v9.s,v10.s,v11.s}[3], [x0], #16
1:  tbz         x2, #0, 1f
    st4         {v8.s,v9.s,v10.s,v11.s}[1], [x0], #16
1:  b           colormatrix_float_realend

colormatrix_float_ldu1_end:
    tbz         x2, #2, 1f
    ld1         {v15.s}[1], [x1], #4
1:  tbz         x2, #1, 1f
    ld1         {v15.h}[1], [x1], #2
1:  tbz         x2, #0, 1f
    ld1         {v15.b}[1], [x1], #1
1:  uxtl        v15.8h, v15.8b
    uxtl        v12.4s, v15.4h
    uxtl2       v20.4s, v15.8h
    ucvtf       v12.4s, v12.4s
    ucvtf       v20.4s, v20.4s
    br          x4

colormatrix_float_ldu2_end:
    tbz         x2, #2, 1f
    ld1         {v15.d}[1], [x1], #8
1:  tbz         x2, #1, 1f
    ld1         {v15.s}[1], [x1], #4
1:  tbz         x2, #0, 1f
    ld1         {v15.h}[1], [x1], #2
1:  uxtl        v14.8h, v15.8b              /* pixels 0-3, channels interleaved */
    uxtl2       v15.8h, v15.16b             /* pixels 4-7, channels interleaved */
    uzp1        v12.8h, v14.8h, v14.8h
    uzp2        v13.8h, v14.8h, v14.8h
    uzp1        v20.8h, v15.8h, v15.8h
    uzp2        v21.8h, v15.8h, v15.8h
    uxtl        v12.4s, v12.4h
    uxtl        v13.4s, v13.4h
    uxtl        v20.4s, v20.4h
    uxtl        v21.4s, v21.4h
    ucvtf       v12.4s, v12.4s
    ucvtf       v13.4s, v13.4s
    ucvtf       v20.4s, v20.4s
    ucvtf       v21.4s, v21.4s
    br          x4

colormatrix_float_ldu3_end:
    tbz         x2, #2, 1f
    ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
    ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
    ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
    ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
1:  tbz         x2, #1, 1f
    ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
    ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
1:  tbz         x2, #0, 1f
    ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
1:  uxtl        v20.8h, v20.8b
    uxtl        v21.8h, v21.8b
    uxtl        v22.8h, v22.8b
    uxtl        v12.4s, v20.4h
    uxtl        v13.4s, v21.4h
    uxtl        v14.4s, v22.4h
    uxtl2       v20.4s, v20.8h
    uxtl2       v21.4s, v21.8h
    uxtl2       v22.4s, v22.8h
    ucvtf       v12.4s, v12.4s
    ucvtf       v13.4s, v13.4s
    ucvtf       v14.4s, v14.4s
    ucvtf       v20.4s, v20.4s
    ucvtf       v21.4s, v21.4s
    ucvtf       v22.4s, v22.4s
    br          x4

colormatrix_float_ldu4_end:
    tbz         x2, #2, 1f
    ld4         {v20.b,v21.b,v22.b,v23.b}[4], [x1], #4
    ld4         {v20.b,v21.b,v22.b,v23.b}[5], [x1], #4
    ld4         {v20.b,v21.b,v22.b,v23.b}[6], [x1], #4
    ld4         {v20.b,v21.b,v22.b,v23.b}[7], [x1], #4
1:  tbz         x2, #1, 1f
    ld4         {v20.b,v21.b,v22.b,v23.b}[2], [x1], #4
    ld4         {v20.b,v21.b,v22.b,v23.b}[3], [x1], #4
1:  tbz         x2, #0, 1f
    ld4         {v20.b,v21.b,v22.b,v23.b}[1], [x1], #4
1:  uxtl        v20.8h, v20.8b
    uxtl        v21.8h, v21.8b
    uxtl        v22.8h, v22.8b
    uxtl        v23.8h, v23.8b
    uxtl        v12.4s, v20.4h
    uxtl        v13.4s, v21.4h
    uxtl        v14.4s, v22.4h
    uxtl        v15.4s, v23.4h
    uxtl2       v20.4s, v20.8h
    uxtl2       v21.4s, v21.8h
    uxtl2       v22.4s, v22.8h
    uxtl2       v23.4s, v23.8h
    ucvtf       v12.4s, v12.4s
    ucvtf       v13.4s, v13.4s
    ucvtf       v14.4s, v14.4s
    ucvtf       v15.4s, v15.4s
    ucvtf       v20.4s, v20.4s
    ucvtf       v21.4s, v21.4s
    ucvtf       v22.4s, v22.4s
    ucvtf       v23.4s, v23.4s
    br          x4

colormatrix_float_ldf1_end:
    tbz         x2, #2, 1f
    ld1         {v20.4s}, [x1], #16
1:  tbz         x2, #1, 1f
    ld1         {v12.d}[1], [x1], #8
1:  tbz         x2, #0, 1f
    ld1         {v12.s}[1], [x1], #4
1:  br          x4

colormatrix_float_ldf2_end:
    tbz         x2, #2, 1f
    ld2         {v20.4s,v21.4s}, [x1], #32
1:  tbz         x2, #1, 1f
    ld2         {v12.s,v13.s}[2], [x1], #8
    ld2         {v12.s,v13.s}[3], [x1], #8
1:  tbz         x2, #0, 1f
    ld2         {v12.s,v13.s}[1], [x1], #8
1:  br          x4

colormatrix_float_ldf3_end:
colormatrix_float_ldf4_end:
    tbz         x2, #2, 1f
    ld4         {v20.4s,v21.4s,v22.4s,v23.4s}, [x1], #64
1:  tbz         x2, #1, 1f
    ld4         {v12.s,v13.s,v14.s,v15.s}[2], [x1], #16
    ld4         {v12.s,v13.s,v14.s,v15.s}[3], [x1], #16
1:  tbz         x2, #0, 1f
    ld4         {v12.s,v13.s,v14.s,v15.s}[1], [x1], #16
1:  br          x4

/* void rsdIntrinsicColorMatrix_int_K(
 *          void *out,              // x0
 *          void const *in,         // x1
 *          size_t count,           // x2
 *          fntab_t const *fns,     // x3
 *          int16_t const *mult,    // x4
 *          int32_t const *add);    // x5
 *
 * Saves the low halves of v8-v15 on the stack, loads the 16 coefficients
 * into v0/v1 and the four bias words into v4, then fetches six stage
 * pointers from fns into x4-x9 and enters the stage chain at x9.  The result
 * registers v8-v11 are pre-loaded with the narrowed bias so that a column
 * whose stage is skipped still yields its bias.
 * Clobbers x2-x9, x16, v0, v1, v4, v6, v7 and the flags in addition to the
 * saved registers.
 */
ENTRY(rsdIntrinsicColorMatrix_int_K)
    sub         x7, sp, #32                 /* x7 = upper half of the 64-byte save area */
    sub         sp, sp, #64
    st1         {v8.1d-v11.1d}, [sp]
    st1         {v12.1d-v15.1d}, [x7]

    ld1         {v0.8h,v1.8h}, [x4], #32
    ld1         {v4.4s}, [x5], #16

    ldp         x4, x5, [x3], #16
    ldp         x6, x7, [x3], #16
    ldp         x8, x9, [x3], #16

    dup         v12.4s, v4.s[0]
    dup         v13.4s, v4.s[1]
    dup         v14.4s, v4.s[2]
    dup         v15.4s, v4.s[3]
    sqshrun     v8.4h, v12.4s, #8
    sqshrun2    v8.8h, v12.4s, #8
    sqshrun     v9.4h, v13.4s, #8
    sqshrun2    v9.8h, v13.4s, #8
    sqshrun     v10.4h, v14.4s, #8
    sqshrun2    v10.8h, v14.4s, #8
    sqshrun     v11.4h, v15.4s, #8
    sqshrun2    v11.8h, v15.4s, #8

    subs        x2, x2, #8
    blo         colormatrix_int_end
    br          x9

/* Reached when fewer than eight pixels remain.  Restores the residual count
 * and, if it is non-zero, swaps in the tail store/load stages (the next pair
 * in fns).  Any column pointer that aliases the old store stage is redirected
 * to the tail store stage as well. */
colormatrix_int_end:
    adds        x2, x2, #8
    bls         colormatrix_int_realend
    mov         x16, x8                     /* x16 = full-width store stage */
    ldp         x8, x9, [x3], #16
    cmp         x4, x16
    csel        x4, x8, x4, eq
    cmp         x5, x16
    csel        x5, x8, x5, eq
    cmp         x6, x16
    csel        x6, x8, x6, eq
    cmp         x7, x16
    csel        x7, x8, x7, eq
    br          x9

colormatrix_int_realend:
    ld1         {v8.1d-v11.1d}, [sp], #32
    ld1         {v12.1d-v15.1d}, [sp], #32
    ret
END(rsdIntrinsicColorMatrix_int_K)

/* void rsdIntrinsicColorMatrixSetup_int_K(
 *          fntab_t const *fns, // x0
 *          uint32_t mask,      // x1
 *          int dt,             // x2
 *          int st);            // x3
 *
 * Fills the eight-pointer table at fns:
 *   [0..3]  column stages, selected by successive 5-bit fields of mask
 *   [4],[5] full-width store and load stages, selected by dt and st
 *   [6],[7] tail store and load stages, selected by dt and st
 * The lookup tables hold 32-bit offsets relative to their own base.
 */
ENTRY(rsdIntrinsicColorMatrixSetup_int_K)
    adrp        x7, 2f
    add         x7, x7, :lo12:2f
    add         x4, x7, x2, LSL #3          /* two words per store entry */
    ldrsw       x2, [x4], #4
    ldrsw       x4, [x4]
    add         x2, x2, x7
    add         x4, x4, x7
    adrp        x7, 3f
    add         x7, x7, :lo12:3f
    add         x5, x7, x3, LSL #3          /* two words per load entry */
    ldrsw       x3, [x5], #4
    ldrsw       x5, [x5]
    add         x3, x3, x7
    add         x5, x5, x7
    stp         x2, x3, [x0, #32]
    stp         x4, x5, [x0, #48]

/* For each column function, if the matrix is all zeroes then write NULL,
 * otherwise look up the appropriate function and store that. */

    mov         x3, #4
    adrp        x7, 4f
    add         x7, x7, :lo12:4f
1:  ands        x2, x1, #15
    beq         9f
    and         x2, x1, #31
    lsl         x2, x2, #4                  /* four words per mask value */
    ldrsw       x2, [x7, x2]
    add         x2, x2, x7
9:  str         x2, [x0], #8
    lsr         x1, x1, #5
    add         x7, x7, #4                  /* next column; table offsets compensate */
    subs        x3, x3, #1
    bne         1b

/* For every NULL entry, copy the non-NULL entry that follows it, or the store
 * function. */

    ldr         x2, [x0]
    mov         x3, #4
1:  ldr         x1, [x0, #-8]!
    cmp         x1, #0
    csel        x2, x1, x2, ne
    str         x2, [x0]
    subs        x3, x3, #1
    bne         1b
    ret

END(rsdIntrinsicColorMatrixSetup_int_K)
.section .rodata
    .align 4
2:  .word       colormatrix_int_stu1-2b
    .word       colormatrix_int_stu1_end-2b
    .word       colormatrix_int_stu2-2b
    .word       colormatrix_int_stu2_end-2b
    .word       colormatrix_int_stu3-2b
    .word       colormatrix_int_stu3_end-2b
    .word       colormatrix_int_stu4-2b
    .word       colormatrix_int_stu4_end-2b
3:  .word       colormatrix_int_ldu1-3b
    .word       colormatrix_int_ldu1_end-3b
    .word       colormatrix_int_ldu2-3b
    .word       colormatrix_int_ldu2_end-3b
    .word       colormatrix_int_ldu3-3b
    .word       colormatrix_int_ldu3_end-3b
    .word       colormatrix_int_ldu4-3b
    .word       colormatrix_int_ldu4_end-3b
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    .word       colormatrix_int_col0_\i-4b
    .word       colormatrix_int_col1_\i-4b-4
    .word       colormatrix_int_col2_\i-4b-8
    .word       colormatrix_int_col3_\i-4b-12
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    .word       colormatrix_int_col0_n\i-4b
    .word       colormatrix_int_col1_n\i-4b-4
    .word       colormatrix_int_col2_n\i-4b-8
    .word       colormatrix_int_col3_n\i-4b-12
.endr


/* void rsdIntrinsicColorMatrix_float_K(
 *          void *out,              // x0
 *          void const *in,         // x1
 *          size_t count,           // x2
 *          fntab_t const *fns,     // x3
 *          float const *mult,      // x4
 *          float const *add);      // x5
 *
 * Saves the low halves of v8-v15 on the stack, loads the 16 coefficients
 * into v0-v3 and broadcasts the four bias values into v4-v7, then fetches six
 * stage pointers from fns into x4-x9 and enters the stage chain at x9.  Both
 * result register sets (v8-v11 and v16-v19) are pre-loaded with the bias so
 * that a column whose stage is skipped still yields its bias.
 * Clobbers x2-x9, x16, v0-v7, v16-v31 and the flags in addition to the saved
 * registers.
 */
ENTRY(rsdIntrinsicColorMatrix_float_K)
    sub         x7, sp, #32                 /* x7 = upper half of the 64-byte save area */
    sub         sp, sp, #64
    st1         {v8.1d-v11.1d}, [sp]
    st1         {v12.1d-v15.1d}, [x7]

    ld1         {v0.4s,v1.4s,v2.4s,v3.4s}, [x4], #64
    ld1r        {v4.4s}, [x5], #4
    ld1r        {v5.4s}, [x5], #4
    ld1r        {v6.4s}, [x5], #4
    ld1r        {v7.4s}, [x5], #4

    ldp         x4, x5, [x3], #16
    ldp         x6, x7, [x3], #16
    ldp         x8, x9, [x3], #16

    mov         v8.16b, v4.16b
    mov         v9.16b, v5.16b
    mov         v10.16b, v6.16b
    mov         v11.16b, v7.16b

    mov         v16.16b, v4.16b
    mov         v17.16b, v5.16b
    mov         v18.16b, v6.16b
    mov         v19.16b, v7.16b

    subs        x2, x2, #8
    blo         colormatrix_float_end
    br          x9

/* Reached when fewer than eight pixels remain.  Restores the residual count
 * and, if it is non-zero, swaps in the tail store/load stages (the next pair
 * in fns).  Any column pointer that aliases the old store stage is redirected
 * to the tail store stage as well. */
colormatrix_float_end:
    adds        x2, x2, #8
    bls         colormatrix_float_realend   /* exit via this function's own epilogue */
    mov         x16, x8                     /* x16 = full-width store stage */
    ldp         x8, x9, [x3], #16
    cmp         x4, x16
    csel        x4, x8, x4, eq
    cmp         x5, x16
    csel        x5, x8, x5, eq
    cmp         x6, x16
    csel        x6, x8, x6, eq
    cmp         x7, x16
    csel        x7, x8, x7, eq
    br          x9

colormatrix_float_realend:
    ld1         {v8.1d-v11.1d}, [sp], #32
    ld1         {v12.1d-v15.1d}, [sp], #32
    ret
END(rsdIntrinsicColorMatrix_float_K)

/* void rsdIntrinsicColorMatrixSetup_float_K(
 *          fntab_t const *fns, // x0
 *          uint32_t mask,      // x1
 *          int dt,             // x2
 *          int st);            // x3
 *
 * Fills the eight-pointer table at fns:
 *   [0..3]  column stages, selected by successive 5-bit fields of mask
 *   [4],[5] full-width store and load stages, selected by dt and st
 *   [6],[7] tail store and load stages, selected by dt and st
 * In the tables below, dt/st values 0-3 select the u8 stages and 4-7 the
 * float stages.  The tables hold 32-bit offsets relative to their own base.
 */
ENTRY(rsdIntrinsicColorMatrixSetup_float_K)
    adrp        x7, 2f
    add         x7, x7, :lo12:2f
    add         x4, x7, x2, LSL #3          /* two words per store entry */
    ldrsw       x2, [x4], #4
    ldrsw       x4, [x4]
    add         x2, x2, x7
    add         x4, x4, x7
    adrp        x7, 3f
    add         x7, x7, :lo12:3f
    add         x5, x7, x3, LSL #3          /* two words per load entry */
    ldrsw       x3, [x5], #4
    ldrsw       x5, [x5]
    add         x3, x3, x7
    add         x5, x5, x7
    stp         x2, x3, [x0, #32]
    stp         x4, x5, [x0, #48]

/* For each column function, if the matrix is all zeroes then write NULL,
 * otherwise look up the appropriate function and store that. */

    mov         x3, #4
    adrp        x7, 4f
    add         x7, x7, :lo12:4f
1:  ands        x2, x1, #15
    beq         9f
    and         x2, x1, #31
    lsl         x2, x2, #4                  /* four words per mask value */
    ldrsw       x2, [x7, x2]
    add         x2, x2, x7
9:  str         x2, [x0], #8
    lsr         x1, x1, #5
    add         x7, x7, #4                  /* next column; table offsets compensate */
    subs        x3, x3, #1
    bne         1b

/* For every NULL entry, copy the non-NULL entry that follows it, or the store
 * function. */

    ldr         x2, [x0]
    mov         x3, #4
1:  ldr         x1, [x0, #-8]!
    cmp         x1, #0
    csel        x2, x1, x2, ne
    str         x2, [x0]
    subs        x3, x3, #1
    bne         1b
    ret

END(rsdIntrinsicColorMatrixSetup_float_K)
.section .rodata
    .align 4
2:  .word       colormatrix_float_stu1-2b
    .word       colormatrix_float_stu1_end-2b
    .word       colormatrix_float_stu2-2b
    .word       colormatrix_float_stu2_end-2b
    .word       colormatrix_float_stu3-2b
    .word       colormatrix_float_stu3_end-2b
    .word       colormatrix_float_stu4-2b
    .word       colormatrix_float_stu4_end-2b
    .word       colormatrix_float_stf1-2b
    .word       colormatrix_float_stf1_end-2b
    .word       colormatrix_float_stf2-2b
    .word       colormatrix_float_stf2_end-2b
    .word       colormatrix_float_stf3-2b
    .word       colormatrix_float_stf3_end-2b
    .word       colormatrix_float_stf4-2b
    .word       colormatrix_float_stf4_end-2b
3:  .word       colormatrix_float_ldu1-3b
    .word       colormatrix_float_ldu1_end-3b
    .word       colormatrix_float_ldu2-3b
    .word       colormatrix_float_ldu2_end-3b
    .word       colormatrix_float_ldu3-3b
    .word       colormatrix_float_ldu3_end-3b
    .word       colormatrix_float_ldu4-3b
    .word       colormatrix_float_ldu4_end-3b
    .word       colormatrix_float_ldf1-3b
    .word       colormatrix_float_ldf1_end-3b
    .word       colormatrix_float_ldf2-3b
    .word       colormatrix_float_ldf2_end-3b
    .word       colormatrix_float_ldf3-3b
    .word       colormatrix_float_ldf3_end-3b
    .word       colormatrix_float_ldf4-3b
    .word       colormatrix_float_ldf4_end-3b
4:
.irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    .word       colormatrix_float_col0_\i-4b
    .word       colormatrix_float_col1_\i-4b-4
    .word       colormatrix_float_col2_\i-4b-8
    .word       colormatrix_float_col3_\i-4b-12
.endr
.irp i, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
    .word       colormatrix_float_col0_n\i-4b
    .word       colormatrix_float_col1_n\i-4b-4
    .word       colormatrix_float_col2_n\i-4b-8
    .word       colormatrix_float_col3_n\i-4b-12
.endr