Lines Matching refs:b
99 ld1 {v10.8b, v11.8b}, [x0], x1 //p2 values are loaded into q5
102 ld1 {v8.8b, v9.8b}, [x0], x1 //p1 values are loaded into q4
104 ld1 {v6.8b, v7.8b}, [x0], x1 //p0 values are loaded into q3
105 uxtl v12.8h, v12.8b //q6 = uc_Bs in each 16 bt scalar
106 ld1 {v0.8b, v1.8b}, [x0], x1 //q0 values are loaded into q0
110 uabd v26.16b, v8.16b, v6.16b
111 ld1 {v2.8b, v3.8b}, [x0], x1 //q1 values are loaded into q1
114 uabd v22.16b, v6.16b, v0.16b
116 uabd v24.16b, v2.16b, v0.16b
117 ld1 {v4.8b, v5.8b}, [x0], x1 //q2 values are loaded into q2
118 tbl v14.8b, {v16.16b}, v12.8b //
120 dup v20.16b, w2 //Q10 contains alpha
121 dup v16.16b, w3 //Q8 contains beta
124 uabd v28.16b, v10.16b, v6.16b
125 uabd v30.16b, v4.16b, v0.16b
128 cmhs v18.16b, v22.16b, v20.16b
129 cmhs v24.16b, v24.16b, v16.16b
130 cmhs v26.16b, v26.16b, v16.16b
131 cmhi v20.16b, v16.16b , v28.16b //Q10=(Ap<Beta)
132 cmhi v22.16b, v16.16b , v30.16b //Q11=(Aq<Beta)
134 orr v18.16b, v18.16b , v24.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
135 usubl v30.8h, v1.8b, v7.8b //
136 usubl v24.8h, v0.8b, v6.8b //Q15,Q12 = (q0 - p0)
137 …orr v18.16b, v18.16b , v26.16b //Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) |…
138 usubl v28.8h, v8.8b, v2.8b //Q14 = (p1 - q1)L
141 usubl v30.8h, v9.8b, v3.8b //Q15 = (p1 - q1)H
142 bic v12.16b, v12.16b , v18.16b //final condition
145 sub v18.16b, v14.16b , v20.16b //Q9 = C0 + (Ap < Beta)
146 urhadd v16.16b, v6.16b , v0.16b //Q8 = ((p0+q0+1) >> 1)
148 sqrshrn v24.8b, v24.8h, #3 //
149 sqrshrn v25.8b, v26.8h, #3 //Q12 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
151 sub v18.16b, v18.16b , v22.16b //Q9 = C0 + (Ap < Beta) + (Aq < Beta)
152 and v20.16b, v20.16b , v12.16b //
153 and v22.16b, v22.16b , v12.16b //
154 abs v26.16b, v24.16b //Q13 = ABS (i_macro)
155 uaddl v28.8h, v17.8b, v11.8b //
156 uaddl v10.8h, v16.8b, v10.8b //Q14,Q5 = p2 + (p0+q0+1)>>1
157 uaddl v30.8h, v17.8b, v5.8b //
158 umin v18.16b, v26.16b , v18.16b //Q9 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
159 ushll v26.8h, v9.8b, #1 //
160 uaddl v4.8h, v16.8b, v4.8b //Q15,Q2 = q2 + (p0+q0+1)>>1
161 ushll v16.8h, v8.8b, #1 //Q13,Q8 = (p1<<1)
162 … and v18.16b, v18.16b , v12.16b //Making delta zero in places where values shouldn be filterd
165 ushll v16.8h, v2.8b, #1 //
166 ushll v26.8h, v3.8b, #1 //Q13,Q8 = (q1<<1)
167 sqshrn v29.8b, v28.8h, #1 //
168 sqshrn v28.8b, v10.8h, #1 //Q14 = i_macro_p1
172 neg v26.16b, v14.16b //Q13 = -C0
173 smin v28.16b, v28.16b , v14.16b //Q14 = min(C0,i_macro_p1)
174 cmge v24.16b, v24.16b, #0
175 sqshrn v31.8b, v30.8h, #1 //
176 sqshrn v30.8b, v4.8h, #1 //Q15 = i_macro_q1
178 smax v28.16b, v28.16b , v26.16b //Q14 = max( - C0 , min(C0, i_macro_p1) )
179 uqadd v16.16b, v6.16b , v18.16b //Q8 = p0 + delta
180 uqsub v6.16b, v6.16b , v18.16b //Q3 = p0 - delta
181 smin v30.16b, v30.16b , v14.16b //Q15 = min(C0,i_macro_q1)
182 and v28.16b, v20.16b , v28.16b //condition check Ap<beta
183 uqadd v14.16b, v0.16b , v18.16b //Q7 = q0 + delta
184 uqsub v0.16b, v0.16b , v18.16b //Q0 = q0 - delta
185 smax v30.16b, v30.16b , v26.16b //Q15 = max( - C0 , min(C0, i_macro_q1) )
186 bif v16.16b, v6.16b , v24.16b //Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
187 bif v0.16b, v14.16b , v24.16b //Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
188 add v28.16b, v28.16b , v8.16b //
189 and v30.16b, v22.16b , v30.16b //condition check Aq<beta
190 st1 {v16.16b}, [x7], x1 //writting back filtered value of p0
191 add v30.16b, v30.16b , v2.16b //
192 st1 {v0.16b}, [x7], x1 //writting back filtered value of q0
193 st1 {v28.16b}, [x6] //writting back filtered value of p1
194 st1 {v30.16b}, [x7], x1 //writting back filtered value of q1
245 dup v0.16b, w2 //duplicate alpha
247 dup v2.16b, w3 //duplicate beta
253 ld1 {v4.8b, v5.8b}, [x0], x1 //load q0 to Q2, q0 = q0 + src_strd
254 ld1 {v6.8b, v7.8b}, [x12] //load p0 to Q3
255 ld1 {v8.8b, v9.8b}, [x0], x1 //load q1 to Q4, q0 = q0 + src_strd
256 ld1 {v10.8b, v11.8b}, [x14] //load p1 to Q5
263 uabd v12.16b , v4.16b, v6.16b
264 uabd v14.16b , v8.16b, v4.16b
265 uabd v16.16b , v10.16b, v6.16b
266 cmhs v18.16b, v12.16b , v0.16b //ABS(p0 - q0) >= Alpha
267 cmhs v14.16b, v14.16b , v2.16b //ABS(q1 - q0) >= Beta
268 cmhs v16.16b, v16.16b , v2.16b //ABS(q1 - q0) >= Beta
269 movi v20.16b, #2
270 orr v18.16b, v18.16b , v14.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
271 ld1 {v14.8b, v15.8b}, [x0], x1 //load q2 to Q7, q0 = q0 + src_strd
273 …orr v18.16b, v18.16b , v16.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p…
274 usra v20.16b, v0.16b, #2 //alpha >>2 +2
275 uabd v22.16b , v14.16b, v4.16b
276 uaddl v24.8h, v4.8b, v6.8b //p0+q0 L
277 uaddl v26.8h, v5.8b, v7.8b //p0+q0 H
278 cmhi v22.16b, v2.16b , v22.16b //Aq < Beta
279 cmhi v20.16b, v20.16b , v12.16b //(ABS(p0 - q0) <((Alpha >>2) + 2))
281 uaddw v28.8h, v24.8h , v8.8b //p0+q0+q1 L
282 uaddw v30.8h, v26.8h , v9.8b //p0+q0+q1 H
283 and v22.16b, v22.16b , v20.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
287 uaddw v16.8h, v16.8h , v14.8b //2*(p0+q0+q1)+q2 L
288 uaddw v0.8h, v0.8h , v15.8b //2*(p0+q0+q1)+q2 H
289 uaddw v16.8h, v16.8h , v10.8b //2*(p0+q0+q1)+q2 +p1 L
290 uaddw v0.8h, v0.8h , v11.8b //2*(p0+q0+q1)+q2 +p1 H
291 rshrn v12.8b, v16.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
292 rshrn v13.8b, v0.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
295 uaddl v16.8h, v8.8b, v8.8b //2*q1 L
296 uaddl v0.8h, v9.8b, v9.8b //2*q1 H
297 uaddw v16.8h, v16.8h , v4.8b //2*q1+q0 L
298 uaddw v0.8h, v0.8h , v5.8b //2*q1+q0 H
299 uaddw v16.8h, v16.8h , v10.8b //2*q1+q0+p1 L
300 uaddw v0.8h, v0.8h , v11.8b //2*q1+q0+p1 H
301 rshrn v16.8b, v16.8h, #2 //(2*q1+q0+p1+2)>>2 L [q0"]
302 rshrn v17.8b, v0.8h, #2 //(2*q1+q0+p1+2)>>2 H [q0"]
304 uaddw v28.8h, v28.8h , v14.8b //p0+q0+q1+q2 L
305 uaddw v30.8h, v30.8h , v15.8b //p0+q0+q1+q2 H
306 ld1 {v0.8b, v1.8b}, [x0], x1 //load q3 to Q0, q0 = q0 + src_strd
308 bit v16.16b, v12.16b , v22.16b //choosing between q0' and q0" depending on condn
310 …bic v22.16b, v22.16b , v18.16b //((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 …
312 rshrn v12.8b, v28.8h, #2 //(p0+q0+q1+q2+2)>>2 L [q1']
313 rshrn v13.8b, v30.8h, #2 //(p0+q0+q1+q2+2)>>2 H [q1']
315 bif v4.16b, v16.16b , v18.16b //choose q0 or filtered q0
317 uaddl v16.8h, v14.8b, v0.8b //q2+q3,L
318 uaddl v0.8h, v15.8b, v1.8b //q2+q3,H
320 st1 {v4.8b, v5.8b}, [x0], x1 //store q0
324 rshrn v0.8b, v28.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
325 rshrn v1.8b, v30.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
327 ld1 {v30.8b, v31.8b}, [x3] //load p2 to Q15
329 bif v12.16b, v8.16b , v22.16b //choose q1 or filtered value of q1
331 uabd v16.16b , v30.16b, v6.16b
332 uaddw v24.8h, v24.8h , v10.8b //p0+q0+p1 L
333 bif v0.16b, v14.16b , v22.16b //choose q2 or filtered q2
335 uaddw v26.8h, v26.8h , v11.8b //p0+q0+p1 H
336 st1 {v12.8b, v13.8b}, [x0], x1 //store q1
337 cmhi v16.16b, v2.16b , v16.16b //Ap < Beta
340 st1 {v0.8b, v1.8b}, [x0], x1 //store q2
341 and v20.16b, v20.16b , v16.16b //((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
342 uaddw v28.8h, v28.8h , v30.8b //2*(p0+q0+p1)+p2 l
343 uaddw v4.8h, v4.8h , v31.8b //2*(p0+q0+p1)+p2 H
344 uaddw v28.8h, v28.8h , v8.8b //2*(p0+q0+p1)+p2+q1 L
345 uaddw v4.8h, v4.8h , v9.8b //2*(p0+q0+p1)+p2+q1 H
346 rshrn v28.8b, v28.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0'
347 rshrn v29.8b, v4.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0'
349 movi v0.8b, #2
351 uaddl v2.8h, v6.8b, v8.8b //p0+q1 L
352 umlal v2.8h, v10.8b, v0.8b //2*p1+p0+q1 L
353 uaddl v16.8h, v7.8b, v9.8b //p0+q1 H
354 umlal v16.8h, v11.8b, v0.8b //2*p1+p0+q1 H
355 uaddw v12.8h, v24.8h , v30.8b //(p0+q0+p1) +p2 L
356 ld1 {v24.8b, v25.8b}, [x2] //load p3,Q12
358 uaddw v4.8h, v26.8h , v31.8b //(p0+q0+p1) +p2 H
359 uaddl v8.8h, v30.8b, v24.8b //p2+p3 L
360 rshrn v26.8b, v12.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' L
361 rshrn v2.8b, v2.8h, #2 //(2*p1+p0+q1+2)>>2,p0"L
362 rshrn v27.8b, v4.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' H
363 rshrn v3.8b, v16.8h, #2 //(2*p1+p0+q1+2)>>2,p0" H
366 uaddl v16.8h, v31.8b, v25.8b //p2+p3 H
369 …bic v16.16b, v20.16b , v18.16b //((ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 …
371 bit v2.16b, v28.16b , v20.16b //choosing between po' and p0"
373 rshrn v12.8b, v12.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
374 rshrn v13.8b, v4.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
376 bif v6.16b, v2.16b , v18.16b //choosing between p0 and filtered value of p0
377 bit v10.16b, v26.16b , v16.16b //choosing between p1 and p1'
378 bit v30.16b, v12.16b , v16.16b //choosing between p2 and p2'
379 st1 {v6.16b}, [x12] //store p0
380 st1 {v10.16b}, [x14] //store p1
381 st1 {v30.16b}, [x3] //store p2
441 ld1 {v0.8b}, [x0], x1 //row1
442 ld1 {v2.8b}, [x0], x1 //row2
443 ld1 {v4.8b}, [x0], x1 //row3
445 ld1 {v6.8b}, [x0], x1 //row4
448 ld1 {v8.8b}, [x0], x1 //row5
449 uxtl v18.8h, v18.8b //q6 = uc_Bs in each 16 bt scalar
450 ld1 {v10.8b}, [x0], x1 //row6
451 ld1 {v12.8b}, [x0], x1 //row7
452 tbl v16.8b, {v16.16b}, v18.8b //puc_ClipTab[uc_Bs]
453 ld1 {v14.8b}, [x0], x1 //row8
454 ld1 {v1.8b}, [x0], x1 //row9
456 ld1 {v3.8b}, [x0], x1 //row10
457 ld1 {v5.8b}, [x0], x1 //row11
458 ld1 {v7.8b}, [x0], x1 //row12
460 ld1 {v9.8b}, [x0], x1 //row13
461 ld1 {v11.8b}, [x0], x1 //row14
462 ld1 {v13.8b}, [x0], x1 //row15
464 ld1 {v15.8b}, [x0], x1 //row16
469 trn1 v21.8b, v0.8b, v2.8b
470 trn2 v2.8b, v0.8b, v2.8b //row1 &2
471 mov v0.8b, v21.8b
472 trn1 v21.8b, v4.8b, v6.8b
473 trn2 v6.8b, v4.8b, v6.8b //row3&row4
474 mov v4.8b, v21.8b
475 trn1 v21.8b, v8.8b, v10.8b
476 trn2 v10.8b, v8.8b, v10.8b //row5&6
477 mov v8.8b, v21.8b
478 trn1 v21.8b, v12.8b, v14.8b
479 trn2 v14.8b, v12.8b, v14.8b //row7 & 8
480 mov v12.8b, v21.8b
481 trn1 v21.8b, v1.8b, v3.8b
482 trn2 v3.8b, v1.8b, v3.8b //row9 &10
483 mov v1.8b, v21.8b
484 trn1 v21.8b, v5.8b, v7.8b
485 trn2 v7.8b, v5.8b, v7.8b //row11 & 12
486 mov v5.8b, v21.8b
487 trn1 v21.8b, v9.8b, v11.8b
488 trn2 v11.8b, v9.8b, v11.8b //row13 &14
489 mov v9.8b, v21.8b
490 trn1 v21.8b, v13.8b, v15.8b
491 trn2 v15.8b, v13.8b, v15.8b //row15 & 16
492 mov v13.8b, v21.8b
496 mov v2.8b, v21.8b
499 mov v10.8b, v21.8b
502 mov v3.8b, v21.8b
505 mov v11.8b, v21.8b
508 mov v6.8b, v21.8b
511 mov v7.8b, v21.8b
515 mov v0.8b, v21.8b
518 mov v8.8b, v21.8b
521 mov v1.8b, v21.8b
524 mov v9.8b, v21.8b
527 mov v0.8b, v21.8b
530 mov v1.8b, v21.8b
535 mov v2.8b, v21.8b
538 urhadd v20.16b, v6.16b , v8.16b //((p0 + q0 + 1) >> 1)
542 mov v3.8b, v31.8b
543 movi v19.8b, #2
548 mov v4.8b, v31.8b
549 uabd v22.16b , v6.16b, v8.16b //ABS(q1 - q0)
552 mov v5.8b, v31.8b
559 uaddl v24.8h, v20.8b, v2.8b //(p2 + ((p0 + q0 + 1) >> 1) L
561 uaddl v26.8h, v21.8b, v3.8b //(p2 + ((p0 + q0 + 1) >> 1) H
562 umlsl v24.8h, v4.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
563 umlsl v26.8h, v5.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
564 dup v28.16b, w2 //alpha
565 cmhs v22.16b, v22.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
566 dup v28.16b, w3 //beta
567 uabd v30.16b , v10.16b, v8.16b //ABS(q1 - q0)
568 sqshrn v24.8b, v24.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
569 sqshrn v25.8b, v26.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
571 cmhs v30.16b, v30.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
572 uabd v26.16b , v4.16b, v6.16b //ABS(q1 - q0)
574 smin v24.16b, v24.16b , v16.16b //min(deltap1 ,C0)
575 orr v22.16b, v22.16b , v30.16b //ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
576 neg v30.16b, v16.16b //-C0
577 cmhs v26.16b, v26.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
578 smax v24.16b, v24.16b , v30.16b //max(deltap1,-C0)
579 …orr v22.16b, v22.16b , v26.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1…
581 uaddl v18.8h, v20.8b, v12.8b //q2 + ((p0 + q0 + 1) >> 1) L
583 usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) L
584 uaddl v20.8h, v21.8b, v13.8b //q2 + ((p0 + q0 + 1) >> 1) H
585 usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
586 usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) H
587 …orr v26.16b, v26.16b , v22.16b //(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 -…
588 usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
589 sqshrn v18.8b, v18.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
590 uabd v22.16b , v2.16b, v6.16b //ABS(q1 - q0)
591 sqshrn v19.8b, v20.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
593 uabd v20.16b , v12.16b, v8.16b //ABS(q1 - q0)
594 cmhi v22.16b, v28.16b , v22.16b //Ap < Beta
595 smin v18.16b, v18.16b , v16.16b //min(delatq1,C0)
596 cmhi v20.16b, v28.16b , v20.16b //Aq <Beta
597 usubl v28.8h, v8.8b, v6.8b //(q0 - p0) L
598 smax v18.16b, v18.16b , v30.16b //max(deltaq1,-C0)
599 usubl v30.8h, v9.8b, v7.8b //(q0 - p0) H
601 sub v16.16b, v16.16b , v22.16b //C0 + (Ap < Beta)
603 uaddw v28.8h, v28.8h , v4.8b //((q0 - p0) << 2) + (p1 L
604 uaddw v30.8h, v30.8h , v5.8b //((q0 - p0) << 2) + (p1 H
605 usubw v28.8h, v28.8h , v10.8b //((q0 - p0) << 2) + (p1 - q1) L
606 usubw v30.8h, v30.8h , v11.8b //((q0 - p0) << 2) + (p1 - q1) H
607 bic v22.16b, v22.16b , v26.16b //final condition for p1
608 rshrn v28.8b, v28.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
609 rshrn v29.8b, v30.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
611 sub v16.16b, v16.16b , v20.16b //C0 + (Ap < Beta) + (Aq < Beta)
612 bic v20.16b, v20.16b , v26.16b //final condition for q1
613 abs v30.16b, v28.16b //abs(delta)
614 and v24.16b, v24.16b , v22.16b //delatp1
615 and v18.16b, v18.16b , v20.16b //delta q1
616 umin v30.16b, v30.16b , v16.16b //min((abs(delta),C)
617 add v4.16b, v4.16b , v24.16b //p1+deltap1
618 add v10.16b, v10.16b , v18.16b //q1+deltaq1
621 bic v30.16b, v30.16b , v26.16b //abs(delta) of pixels to be changed only
623 cmge v28.16b, v28.16b , #0
624 uqsub v22.16b, v6.16b , v30.16b //clip(p0-delta)
626 trn1 v21.8b, v0.8b, v2.8b
627 trn2 v2.8b, v0.8b, v2.8b //row1 &2
628 mov v0.8b, v21.8b
629 uqadd v6.16b, v6.16b , v30.16b //clip(p0+delta)
631 trn1 v21.8b, v1.8b, v3.8b
632 trn2 v3.8b, v1.8b, v3.8b //row9 &10
633 mov v1.8b, v21.8b
634 uqadd v24.16b, v8.16b , v30.16b //clip(q0+delta)
635 trn1 v21.8b, v12.8b, v14.8b
636 trn2 v14.8b, v12.8b, v14.8b //row7 & 8
637 mov v12.8b, v21.8b
638 uqsub v8.16b, v8.16b , v30.16b //clip(q0-delta)
639 trn1 v21.8b, v13.8b, v15.8b
640 trn2 v15.8b, v13.8b, v15.8b //row15 & 16
641 mov v13.8b, v21.8b
642 bif v6.16b, v22.16b , v28.16b //p0
643 bif v8.16b, v24.16b , v28.16b //q0
646 trn1 v21.8b, v4.8b, v6.8b
647 trn2 v6.8b, v4.8b, v6.8b //row3&row4
648 mov v4.8b, v21.8b
649 trn1 v21.8b, v8.8b, v10.8b
650 trn2 v10.8b, v8.8b, v10.8b //row5&6
651 mov v8.8b, v21.8b
652 trn1 v21.8b, v5.8b, v7.8b
653 trn2 v7.8b, v5.8b, v7.8b //row11 & 12
654 mov v5.8b, v21.8b
655 trn1 v21.8b, v9.8b, v11.8b
656 trn2 v11.8b, v9.8b, v11.8b //row13 &14
657 mov v9.8b, v21.8b
660 mov v2.8b, v21.8b
663 mov v10.8b, v21.8b
666 mov v3.8b, v21.8b
669 mov v11.8b, v21.8b
672 mov v6.8b, v21.8b
675 mov v7.8b, v21.8b
679 mov v0.8b, v21.8b
682 mov v8.8b, v21.8b
685 mov v1.8b, v21.8b
688 mov v9.8b, v21.8b
692 mov v0.8b, v21.8b
695 mov v1.8b, v21.8b
698 mov v2.8b, v21.8b
701 mov v3.8b, v21.8b
704 mov v4.8b, v21.8b
707 mov v5.8b, v21.8b
708 st1 {v0.8b}, [x0], x1 //row1
709 st1 {v2.8b}, [x0], x1 //row2
710 st1 {v4.8b}, [x0], x1 //row3
711 st1 {v6.8b}, [x0], x1 //row4
712 st1 {v8.8b}, [x0], x1 //row5
713 st1 {v10.8b}, [x0], x1 //row6
714 st1 {v12.8b}, [x0], x1 //row7
715 st1 {v14.8b}, [x0], x1 //row8
716 st1 {v1.8b}, [x0], x1 //row9
717 st1 {v3.8b}, [x0], x1 //row10
718 st1 {v5.8b}, [x0], x1 //row11
719 st1 {v7.8b}, [x0], x1 //row12
720 st1 {v9.8b}, [x0], x1 //row13
721 st1 {v11.8b}, [x0], x1 //row14
722 st1 {v13.8b}, [x0], x1 //row15
723 st1 {v15.8b}, [x0], x1 //row16
775 ld1 {v0.8b}, [x0], x1 //row1
776 ld1 {v2.8b}, [x0], x1 //row2
777 ld1 {v4.8b}, [x0], x1 //row3
778 ld1 {v6.8b}, [x0], x1 //row4
779 ld1 {v8.8b}, [x0], x1 //row5
780 ld1 {v10.8b}, [x0], x1 //row6
781 ld1 {v12.8b}, [x0], x1 //row7
782 ld1 {v14.8b}, [x0], x1 //row8
783 ld1 {v1.8b}, [x0], x1 //row9
784 ld1 {v3.8b}, [x0], x1 //row10
785 ld1 {v5.8b}, [x0], x1 //row11
786 ld1 {v7.8b}, [x0], x1 //row12
787 ld1 {v9.8b}, [x0], x1 //row13
788 ld1 {v11.8b}, [x0], x1 //row14
789 ld1 {v13.8b}, [x0], x1 //row15
790 ld1 {v15.8b}, [x0], x1 //row16
794 trn1 v21.8b, v0.8b, v2.8b
795 trn2 v2.8b, v0.8b, v2.8b //row1 &2
796 mov v0.8b, v21.8b
797 trn1 v21.8b, v4.8b, v6.8b
798 trn2 v6.8b, v4.8b, v6.8b //row3&row4
799 mov v4.8b, v21.8b
800 trn1 v21.8b, v8.8b, v10.8b
801 trn2 v10.8b, v8.8b, v10.8b //row5&6
802 mov v8.8b, v21.8b
803 trn1 v21.8b, v12.8b, v14.8b
804 trn2 v14.8b, v12.8b, v14.8b //row7 & 8
805 mov v12.8b, v21.8b
806 trn1 v21.8b, v1.8b, v3.8b
807 trn2 v3.8b, v1.8b, v3.8b //row9 &10
808 mov v1.8b , v21.8b
809 trn1 v21.8b, v5.8b, v7.8b
810 trn2 v7.8b, v5.8b, v7.8b //row11 & 12
811 mov v5.8b , v21.8b
812 trn1 v21.8b, v9.8b, v11.8b
813 trn2 v11.8b, v9.8b, v11.8b //row13 &14
814 mov v9.8b , v21.8b
815 trn1 v21.8b, v13.8b, v15.8b
816 trn2 v15.8b, v13.8b, v15.8b //row15 & 16
817 mov v13.8b , v21.8b
821 mov v2.8b, v21.8b
824 mov v10.8b , v21.8b
827 mov v3.8b, v21.8b
830 mov v11.8b, v21.8b
833 mov v6.8b, v21.8b
836 mov v7.8b, v21.8b
840 mov v0.8b , v21.8b
843 mov v8.8b, v21.8b
846 mov v1.8b, v21.8b
849 mov v9.8b , v21.8b
852 mov v0.8b, v21.8b
855 mov v1.8b, v21.8b
863 mov v4.8b, v21.8b
867 mov v5.8b, v21.8b
868 uaddl v16.8h, v6.8b, v8.8b //p0+q0 L
871 mov v2.8b, v21.8b
872 uaddl v18.8h, v7.8b, v9.8b //p0+q0 H
875 mov v3.8b, v21.8b
876 uaddw v20.8h, v16.8h , v4.8b //p0+q0+p1 L
877 uaddw v22.8h, v18.8h , v5.8b //p0+q0+p1 H
878 uaddl v24.8h, v2.8b, v10.8b //p2+q1 L
879 uaddl v26.8h, v3.8b, v11.8b //p2+q1 H
882 movi v28.16b, #2
883 uaddw v16.8h, v20.8h , v2.8b //p0+q0+p1+p2 L
884 uaddw v18.8h, v22.8h , v3.8b //p0+q0+p1+p2 H
885 dup v30.16b, w2 //duplicate alpha
886 rshrn v20.8b, v16.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
887 rshrn v21.8b, v18.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
897 uabd v22.16b , v6.16b, v8.16b
898 usra v28.16b, v30.16b, #2 //alpha >>2 +2
899 uabd v30.16b , v2.16b, v6.16b
900 rshrn v24.8b, v24.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
901 rshrn v25.8b, v26.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
903 dup v26.16b, w3 //beta
904 cmhi v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2)
905 uaddl v22.8h, v6.8b, v10.8b //p0+q1 L
906 cmhi v14.16b, v26.16b , v30.16b //beta>Ap
907 uaddl v30.8h, v7.8b, v11.8b //p0+q1 H
908 uaddw v22.8h, v22.8h , v4.8b //p0+q1+p1 L
909 uaddw v30.8h, v30.8h , v5.8b //p0+q1+p1 H
910 uaddw v22.8h, v22.8h , v4.8b //p0+q1+2*p1 L
911 uaddw v30.8h, v30.8h , v5.8b //p0+q1+2*p1 H
912 and v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
913 rshrn v22.8b, v22.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) L p0"
914 rshrn v23.8b, v30.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) H p0"
916 uaddl v30.8h, v2.8b, v0.8b //p2+p3 L
917 bif v24.16b, v22.16b , v14.16b //p0' or p0 "
918 uaddl v22.8h, v3.8b, v1.8b //p2+p3 H
923 uabd v30.16b , v12.16b, v8.16b
924 uabd v22.16b , v10.16b, v8.16b
925 rshrn v16.8b, v16.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
926 rshrn v17.8b, v18.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
928 uabd v18.16b , v4.16b, v6.16b
929 cmhi v30.16b, v26.16b , v30.16b //Aq < Beta
930 cmhs v22.16b, v22.16b, v26.16b
931 cmhs v18.16b, v18.16b, v26.16b
932 dup v26.16b, w2 //duplicate alpha
933 and v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
934 uabd v28.16b , v6.16b, v8.16b
935 orr v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
936 uaddl v18.8h, v6.8b, v8.8b //p0+q0 L
937 cmhs v28.16b, v28.16b, v26.16b
938 uaddl v26.8h, v7.8b, v9.8b //p0+q0 H
939 uaddw v18.8h, v18.8h , v10.8b //p0+q0+q1 L
940 …orr v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) …
941 uaddw v26.8h, v26.8h , v11.8b //p0+q0+q1 H
942 bic v14.16b, v14.16b , v22.16b //final condn for p's
943 movi v28.16b, #2
944 bif v6.16b, v24.16b , v22.16b //final p0
945 bit v2.16b, v16.16b , v14.16b //final p2
946 bif v20.16b, v4.16b , v14.16b //final p1
950 uaddl v24.8h, v8.8b, v4.8b //q0+p1 L
951 umlal v24.8h, v10.8b, v28.8b //X2(q1) + q0 + p1 L
952 uaddl v16.8h, v9.8b, v5.8b //q0+p1 H
953 umlal v16.8h, v11.8b, v28.8b //X2(q1) + q0 + p1 H
955 uaddl v14.8h, v4.8b, v12.8b //p1+q2 L
957 uaddl v4.8h, v5.8b, v13.8b //p1+q2H
959 rshrn v24.8b, v24.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; L q0'
960 rshrn v25.8b, v16.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; H q0'
962 uaddw v18.8h, v18.8h , v12.8b //p0 + q0 + q1 + q2 L
963 uaddw v26.8h, v26.8h , v13.8b //p0 + q0 + q1 + q2 H
964 rshrn v16.8b, v14.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo"
965 mov v14.16b, v31.16b
966 rshrn v17.8b, v4.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo"
968 rshrn v4.8b, v18.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 L q1'
969 rshrn v5.8b, v26.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 H q1'
971 bit v24.16b, v16.16b , v30.16b //q0' or q0"
972 bic v30.16b, v30.16b , v22.16b //final condn for q's
973 trn1 v31.8b, v0.8b, v2.8b
974 trn2 v2.8b, v0.8b, v2.8b //row1 &2
975 mov v0.8b, v31.8b
976 bit v10.16b, v4.16b , v30.16b
980 trn1 v31.8b, v1.8b, v3.8b
981 trn2 v3.8b, v1.8b, v3.8b //row9 &10
982 mov v1.8b, v31.8b
983 uaddl v16.8h, v12.8b, v14.8b //q2+q3 L
984 trn1 v31.8b, v20.8b, v6.8b
985 trn2 v6.8b, v20.8b, v6.8b //row3&row4
986 mov v20.8b , v31.8b
987 uaddl v4.8h, v13.8b, v15.8b //q2+q3 H
988 trn1 v31.8b, v21.8b, v7.8b
989 trn2 v7.8b, v21.8b, v7.8b //row11 & 12
990 mov v21.8b , v31.8b
994 mov v2.8b, v31.8b
998 mov v3.8b , v31.8b
999 bif v8.16b, v24.16b , v22.16b //final q0
1003 mov v0.8b , v31.8b
1004 rshrn v18.8b, v18.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
1007 mov v1.8b, v31.8b
1008 rshrn v19.8b, v26.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
1010 trn1 v31.8b, v8.8b, v10.8b
1011 trn2 v10.8b, v8.8b, v10.8b //row5&6
1012 mov v8.8b, v31.8b
1013 bit v12.16b, v18.16b , v30.16b //final q2
1015 trn1 v31.8b, v9.8b, v11.8b
1016 trn2 v11.8b, v9.8b, v11.8b //row13 &14
1017 mov v9.8b, v31.8b
1018 trn1 v31.8b, v12.8b, v14.8b
1019 trn2 v14.8b, v12.8b, v14.8b //row7 & 8
1020 mov v12.8b, v31.8b
1021 trn1 v31.8b, v13.8b, v15.8b
1022 trn2 v15.8b, v13.8b, v15.8b //row15 & 16
1023 mov v13.8b , v31.8b
1026 mov v10.8b, v31.8b
1029 mov v11.8b, v31.8b
1033 mov v8.8b, v31.8b
1036 mov v9.8b, v31.8b
1040 mov v6.8b , v31.8b
1043 mov v7.8b, v31.8b
1046 mov v0.8b , v31.8b
1049 mov v1.8b , v31.8b
1052 mov v2.8b , v31.8b
1055 mov v3.8b , v31.8b
1058 mov v20.8b , v31.8b
1061 mov v21.8b, v31.8b
1062 st1 {v0.8b}, [x0], x1 //row1
1063 st1 {v2.8b}, [x0], x1 //row2
1064 st1 {v20.8b}, [x0], x1 //row3
1065 st1 {v6.8b}, [x0], x1 //row4
1066 st1 {v8.8b}, [x0], x1 //row5
1067 st1 {v10.8b}, [x0], x1 //row6
1068 st1 {v12.8b}, [x0], x1 //row7
1069 st1 {v14.8b}, [x0], x1 //row8
1070 st1 {v1.8b}, [x0], x1 //row9
1071 st1 {v3.8b}, [x0], x1 //row10
1072 st1 {v21.8b}, [x0], x1 //row11
1073 st1 {v7.8b}, [x0], x1 //row12
1074 st1 {v9.8b}, [x0], x1 //row13
1075 st1 {v11.8b}, [x0], x1 //row14
1076 st1 {v13.8b}, [x0], x1 //row15
1077 st1 {v15.8b}, [x0], x1 //row16