Lines Matching refs:b

//Fragment (listing lines 100-195): deblocking of six pixel rows around an edge,
//with the p0/q0 change limited to C and the p1/q1 change clamped to +/-C0.
//Only lines that matched the search are listed, so some registers are written
//or merged by instructions that do not appear here.
//Rows as loaded below: p2=v10/v11, p1=v8/v9, p0=v6/v7, q0=v0/v1, q1=v2/v3, q2=v4/v5.
//w2 = alpha, w3 = beta (see the dup instructions).
//NOTE(review): the .16b operations read both halves of v10, v8, v6, v0, v2, v4 although
//the loads fill two 8-byte registers; the merge is presumably done by unlisted lines.
100     ld1       {v10.8b, v11.8b}, [x0], x1 //load p2 row into v10, v11, then x0 += x1
103 ld1 {v8.8b, v9.8b}, [x0], x1 //load p1 row into v8, v9, then x0 += x1
105 ld1 {v6.8b, v7.8b}, [x0], x1 //load p0 row into v6, v7, then x0 += x1
106 uxtl v12.8h, v12.8b //v12 = uc_Bs, each byte zero-extended to 16 bits
107 ld1 {v0.8b, v1.8b}, [x0], x1 //load q0 row into v0, v1, then x0 += x1
111 uabd v26.16b, v8.16b, v6.16b //v26 = ABS(p1 - p0)
112 ld1 {v2.8b, v3.8b}, [x0], x1 //load q1 row into v2, v3, then x0 += x1
115 uabd v22.16b, v6.16b, v0.16b //v22 = ABS(p0 - q0)
117 uabd v24.16b, v2.16b, v0.16b //v24 = ABS(q1 - q0)
118 ld1 {v4.8b, v5.8b}, [x0], x1 //load q2 row into v4, v5, then x0 += x1
119 tbl v14.8b, {v16.16b}, v12.8b //v14 = bytes of the table in v16 selected by the indices in v12 (used below as C0)
121 dup v20.16b, w2 //v20 = alpha in every byte lane
122 dup v16.16b, w3 //v16 = beta in every byte lane
125 uabd v28.16b, v10.16b, v6.16b //v28 = Ap = ABS(p2 - p0)
126 uabd v30.16b, v4.16b, v0.16b //v30 = Aq = ABS(q2 - q0)
129 cmhs v18.16b, v22.16b, v20.16b //v18 = ( ABS(p0 - q0) >= Alpha )
130 cmhs v24.16b, v24.16b, v16.16b //v24 = ( ABS(q1 - q0) >= Beta )
131 cmhs v26.16b, v26.16b, v16.16b //v26 = ( ABS(p1 - p0) >= Beta )
132 cmhi v20.16b, v16.16b , v28.16b //v20 = (Ap < Beta)
133 cmhi v22.16b, v16.16b , v30.16b //v22 = (Aq < Beta)
135 orr v18.16b, v18.16b , v24.16b //v18 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
136 usubl v30.8h, v1.8b, v7.8b //v30 = (q0 - p0) H, widened to 16 bits
137 usubl v24.8h, v0.8b, v6.8b //v24 = (q0 - p0) L, widened to 16 bits
138 …orr v18.16b, v18.16b , v26.16b //v18 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
139 usubl v28.8h, v8.8b, v2.8b //v28 = (p1 - q1) L
142 usubl v30.8h, v9.8b, v3.8b //v30 = (p1 - q1) H
143 bic v12.16b, v12.16b , v18.16b //final condition: v12 = v12 & ~v18
146 sub v18.16b, v14.16b , v20.16b //v18 = C0 + (Ap < Beta); a true compare lane is all ones (-1), so subtracting it adds 1
147 urhadd v16.16b, v6.16b , v0.16b //v16 = ((p0+q0+1) >> 1)
149 sqrshrn v24.8b, v24.8h, #3 //i_macro L: rounding, saturating narrow of (v24 >> 3)
150 sqrshrn v25.8b, v26.8h, #3 //v24/v25 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
152 sub v18.16b, v18.16b , v22.16b //v18 = C = C0 + (Ap < Beta) + (Aq < Beta)
153 and v20.16b, v20.16b , v12.16b //v20 = (Ap < Beta) only in lanes passing the final condition
154 and v22.16b, v22.16b , v12.16b //v22 = (Aq < Beta) only in lanes passing the final condition
155 abs v26.16b, v24.16b //v26 = ABS (i_macro)
156 uaddl v28.8h, v17.8b, v11.8b //v28 = p2 + ((p0+q0+1)>>1) H
157 uaddl v10.8h, v16.8b, v10.8b //v10 = p2 + ((p0+q0+1)>>1) L
158 uaddl v30.8h, v17.8b, v5.8b //v30 = q2 + ((p0+q0+1)>>1) H
159 umin v18.16b, v26.16b , v18.16b //v18 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
160 ushll v26.8h, v9.8b, #1 //v26 = (p1<<1) H
161 uaddl v4.8h, v16.8b, v4.8b //v4 = q2 + ((p0+q0+1)>>1) L
162 ushll v16.8h, v8.8b, #1 //v16 = (p1<<1) L
163 … and v18.16b, v18.16b , v12.16b //make delta zero in lanes that should not be filtered
166 ushll v16.8h, v2.8b, #1 //v16 = (q1<<1) L
167 ushll v26.8h, v3.8b, #1 //v26 = (q1<<1) H
168 sqshrn v29.8b, v28.8h, #1 //i_macro_p1 H: saturating narrow of (v28 >> 1)
169 sqshrn v28.8b, v10.8h, #1 //v28 = i_macro_p1 (L)
173 neg v26.16b, v14.16b //v26 = -C0
174 smin v28.16b, v28.16b , v14.16b //v28 = min(C0,i_macro_p1)
175 cmge v24.16b, v24.16b, #0 //v24 = mask of lanes where i_macro >= 0
176 sqshrn v31.8b, v30.8h, #1 //i_macro_q1 H: saturating narrow of (v30 >> 1)
177 sqshrn v30.8b, v4.8h, #1 //v30 = i_macro_q1 (L)
179 smax v28.16b, v28.16b , v26.16b //v28 = max( - C0 , min(C0, i_macro_p1) )
180 uqadd v16.16b, v6.16b , v18.16b //v16 = p0 + delta (unsigned saturating)
181 uqsub v6.16b, v6.16b , v18.16b //v6 = p0 - delta (unsigned saturating)
182 smin v30.16b, v30.16b , v14.16b //v30 = min(C0,i_macro_q1)
183 and v28.16b, v20.16b , v28.16b //keep the p1 adjustment only where the masked Ap<Beta condition holds
184 uqadd v14.16b, v0.16b , v18.16b //v14 = q0 + delta (unsigned saturating)
185 uqsub v0.16b, v0.16b , v18.16b //v0 = q0 - delta (unsigned saturating)
186 smax v30.16b, v30.16b , v26.16b //v30 = max( - C0 , min(C0, i_macro_q1) )
187 bif v16.16b, v6.16b , v24.16b //v16 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
188 bif v0.16b, v14.16b , v24.16b //v0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
189 add v28.16b, v28.16b , v8.16b //v28 = p1 + clamped p1 adjustment
190 and v30.16b, v22.16b , v30.16b //keep the q1 adjustment only where the masked Aq<Beta condition holds
191 st1 {v16.16b}, [x7], x1 //write back filtered p0, then x7 += x1
192 add v30.16b, v30.16b , v2.16b //v30 = q1 + clamped q1 adjustment
193 st1 {v0.16b}, [x7], x1 //write back filtered q0, then x7 += x1
194 st1 {v28.16b}, [x6] //write back filtered p1
195 st1 {v30.16b}, [x7], x1 //write back filtered q1, then x7 += x1
//Fragment (listing lines 247-383): strong deblocking filter on pixel rows.
//Only lines that matched the search are listed; several accumulators used below
//(for example v16/v0 at line 289 and v28/v4 at line 344) are prepared by
//instructions that do not appear here.
//Rows as loaded below: q0=v4/v5, p0=v6/v7, q1=v8/v9, p1=v10/v11, q2=v14/v15,
//q3=v0/v1, p2=v30/v31, p3=v24/v25. L/H = low/high 8 pixels.
247 dup v0.16b, w2 //v0 = alpha in every byte lane
249 dup v2.16b, w3 //v2 = beta in every byte lane
255 ld1 {v4.8b, v5.8b}, [x0], x1 //load q0 into v4, v5, then x0 += x1 (src_strd)
256 ld1 {v6.8b, v7.8b}, [x12] //load p0 into v6, v7
257 ld1 {v8.8b, v9.8b}, [x0], x1 //load q1 into v8, v9, then x0 += x1 (src_strd)
258 ld1 {v10.8b, v11.8b}, [x14] //load p1 into v10, v11
265 uabd v12.16b , v4.16b, v6.16b //v12 = ABS(p0 - q0)
266 uabd v14.16b , v8.16b, v4.16b //v14 = ABS(q1 - q0)
267 uabd v16.16b , v10.16b, v6.16b //v16 = ABS(p1 - p0)
268 cmhs v18.16b, v12.16b , v0.16b //ABS(p0 - q0) >= Alpha
269 cmhs v14.16b, v14.16b , v2.16b //ABS(q1 - q0) >= Beta
270 cmhs v16.16b, v16.16b , v2.16b //ABS(p1 - p0) >= Beta
271 movi v20.16b, #2 //v20 = 2 in every byte lane
272 orr v18.16b, v18.16b , v14.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
273 ld1 {v14.8b, v15.8b}, [x0], x1 //load q2 into v14, v15, then x0 += x1 (src_strd)
275 …orr v18.16b, v18.16b , v16.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
276 usra v20.16b, v0.16b, #2 //v20 = (alpha >> 2) + 2
277 uabd v22.16b , v14.16b, v4.16b //v22 = Aq = ABS(q2 - q0)
278 uaddl v24.8h, v4.8b, v6.8b //p0+q0 L
279 uaddl v26.8h, v5.8b, v7.8b //p0+q0 H
280 cmhi v22.16b, v2.16b , v22.16b //Aq < Beta
281 cmhi v20.16b, v20.16b , v12.16b //(ABS(p0 - q0) <((Alpha >>2) + 2))
283 uaddw v28.8h, v24.8h , v8.8b //p0+q0+q1 L
284 uaddw v30.8h, v26.8h , v9.8b //p0+q0+q1 H
285 and v22.16b, v22.16b , v20.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
289 uaddw v16.8h, v16.8h , v14.8b //2*(p0+q0+q1)+q2 L
290 uaddw v0.8h, v0.8h , v15.8b //2*(p0+q0+q1)+q2 H
291 uaddw v16.8h, v16.8h , v10.8b //2*(p0+q0+q1)+q2 +p1 L
292 uaddw v0.8h, v0.8h , v11.8b //2*(p0+q0+q1)+q2 +p1 H
293 rshrn v12.8b, v16.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
294 rshrn v13.8b, v0.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
297 uaddl v16.8h, v8.8b, v8.8b //2*q1 L
298 uaddl v0.8h, v9.8b, v9.8b //2*q1 H
299 uaddw v16.8h, v16.8h , v4.8b //2*q1+q0 L
300 uaddw v0.8h, v0.8h , v5.8b //2*q1+q0 H
301 uaddw v16.8h, v16.8h , v10.8b //2*q1+q0+p1 L
302 uaddw v0.8h, v0.8h , v11.8b //2*q1+q0+p1 H
303 rshrn v16.8b, v16.8h, #2 //(2*q1+q0+p1+2)>>2 L [q0"]
304 rshrn v17.8b, v0.8h, #2 //(2*q1+q0+p1+2)>>2 H [q0"]
306 uaddw v28.8h, v28.8h , v14.8b //p0+q0+q1+q2 L
307 uaddw v30.8h, v30.8h , v15.8b //p0+q0+q1+q2 H
308 ld1 {v0.8b, v1.8b}, [x0], x1 //load q3 into v0, v1, then x0 += x1 (src_strd)
310 bit v16.16b, v12.16b , v22.16b //v16 = q0' where the v22 condition holds, q0" elsewhere
312 …bic v22.16b, v22.16b , v18.16b //v22 = (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) && !(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
314 rshrn v12.8b, v28.8h, #2 //(p0+q0+q1+q2+2)>>2 L [q1']
315 rshrn v13.8b, v30.8h, #2 //(p0+q0+q1+q2+2)>>2 H [q1']
317 bif v4.16b, v16.16b , v18.16b //keep q0 where v18 is set, take the filtered q0 elsewhere
319 uaddl v16.8h, v14.8b, v0.8b //q2+q3,L
320 uaddl v0.8h, v15.8b, v1.8b //q2+q3,H
322 st1 {v4.8b, v5.8b}, [x0], x1 //store q0, then x0 += x1
326 rshrn v0.8b, v28.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
327 rshrn v1.8b, v30.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
329 ld1 {v30.8b, v31.8b}, [x3] //load p2 into v30, v31 (x3 is an address here; w3 supplied beta at line 249)
331 bif v12.16b, v8.16b , v22.16b //v12 = filtered q1 where v22 is set, original q1 elsewhere
333 uabd v16.16b , v30.16b, v6.16b //v16 = Ap = ABS(p2 - p0)
334 uaddw v24.8h, v24.8h , v10.8b //p0+q0+p1 L
335 bif v0.16b, v14.16b , v22.16b //v0 = filtered q2 where v22 is set, original q2 elsewhere
337 uaddw v26.8h, v26.8h , v11.8b //p0+q0+p1 H
338 st1 {v12.8b, v13.8b}, [x0], x1 //store q1, then x0 += x1
339 cmhi v16.16b, v2.16b , v16.16b //Ap < Beta
342 st1 {v0.8b, v1.8b}, [x0], x1 //store q2, then x0 += x1
343 and v20.16b, v20.16b , v16.16b //((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
344 uaddw v28.8h, v28.8h , v30.8b //2*(p0+q0+p1)+p2 L
345 uaddw v4.8h, v4.8h , v31.8b //2*(p0+q0+p1)+p2 H
346 uaddw v28.8h, v28.8h , v8.8b //2*(p0+q0+p1)+p2+q1 L
347 uaddw v4.8h, v4.8h , v9.8b //2*(p0+q0+p1)+p2+q1 H
348 rshrn v28.8b, v28.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0'
349 rshrn v29.8b, v4.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0'
351 movi v0.8b, #2 //v0 = 2, multiplier for the umlal below
353 uaddl v2.8h, v6.8b, v8.8b //p0+q1 L
354 umlal v2.8h, v10.8b, v0.8b //2*p1+p0+q1 L
355 uaddl v16.8h, v7.8b, v9.8b //p0+q1 H
356 umlal v16.8h, v11.8b, v0.8b //2*p1+p0+q1 H
357 uaddw v12.8h, v24.8h , v30.8b //(p0+q0+p1) +p2 L
358 ld1 {v24.8b, v25.8b}, [x2] //load p3 into v24, v25 (x2 is an address here; w2 supplied alpha at line 247)
360 uaddw v4.8h, v26.8h , v31.8b //(p0+q0+p1) +p2 H
361 uaddl v8.8h, v30.8b, v24.8b //p2+p3 L
362 rshrn v26.8b, v12.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' L
363 rshrn v2.8b, v2.8h, #2 //(2*p1+p0+q1+2)>>2,p0"L
364 rshrn v27.8b, v4.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' H
365 rshrn v3.8b, v16.8h, #2 //(2*p1+p0+q1+2)>>2,p0" H
368 uaddl v16.8h, v31.8b, v25.8b //p2+p3 H
371 …bic v16.16b, v20.16b , v18.16b //v16 = ((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2))) && !(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
373 bit v2.16b, v28.16b , v20.16b //v2 = p0' where the v20 condition holds, p0" elsewhere
375 rshrn v12.8b, v12.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
376 rshrn v13.8b, v4.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
378 bif v6.16b, v2.16b , v18.16b //keep p0 where v18 is set, take the filtered p0 elsewhere
379 bit v10.16b, v26.16b , v16.16b //take p1' where v16 is set, keep p1 elsewhere
380 bit v30.16b, v12.16b , v16.16b //take p2' where v16 is set, keep p2 elsewhere
381 st1 {v6.16b}, [x12] //store p0
382 st1 {v10.16b}, [x14] //store p1
383 st1 {v30.16b}, [x3] //store p2
//Fragment (listing lines 444-538): load 16 rows of 8 bytes and start transposing them.
//Rows 1-8 go to the even registers v0..v14, rows 9-16 to the odd registers v1..v15;
//x0 advances by x1 after every row.
//Only the byte-level (.8b) steps of the transpose matched the search; the steps on
//wider elements do not appear in this listing.
444 ld1 {v0.8b}, [x0], x1 //row1
445 ld1 {v2.8b}, [x0], x1 //row2
446 ld1 {v4.8b}, [x0], x1 //row3
448 ld1 {v6.8b}, [x0], x1 //row4
451 ld1 {v8.8b}, [x0], x1 //row5
452 uxtl v18.8h, v18.8b //v18 = uc_Bs, each byte zero-extended to 16 bits
453 ld1 {v10.8b}, [x0], x1 //row6
454 ld1 {v12.8b}, [x0], x1 //row7
455 tbl v16.8b, {v16.16b}, v18.8b //v16 = puc_ClipTab[uc_Bs]: table bytes in v16 selected by the indices in v18
456 ld1 {v14.8b}, [x0], x1 //row8
457 ld1 {v1.8b}, [x0], x1 //row9
459 ld1 {v3.8b}, [x0], x1 //row10
460 ld1 {v5.8b}, [x0], x1 //row11
461 ld1 {v7.8b}, [x0], x1 //row12
463 ld1 {v9.8b}, [x0], x1 //row13
464 ld1 {v11.8b}, [x0], x1 //row14
465 ld1 {v13.8b}, [x0], x1 //row15
467 ld1 {v15.8b}, [x0], x1 //row16
//Byte-wise trn1/trn2 on each pair of rows; v21 is a temporary holding the trn1
//result until trn2 has consumed the original first operand.
472 trn1 v21.8b, v0.8b, v2.8b //v21 = trn1 of rows 1 and 2
473 trn2 v2.8b, v0.8b, v2.8b //row1 &2
474 mov v0.8b, v21.8b //v0 = trn1 result
475 trn1 v21.8b, v4.8b, v6.8b //v21 = trn1 of rows 3 and 4
476 trn2 v6.8b, v4.8b, v6.8b //row3&row4
477 mov v4.8b, v21.8b //v4 = trn1 result
478 trn1 v21.8b, v8.8b, v10.8b //v21 = trn1 of rows 5 and 6
479 trn2 v10.8b, v8.8b, v10.8b //row5&6
480 mov v8.8b, v21.8b //v8 = trn1 result
481 trn1 v21.8b, v12.8b, v14.8b //v21 = trn1 of rows 7 and 8
482 trn2 v14.8b, v12.8b, v14.8b //row7 & 8
483 mov v12.8b, v21.8b //v12 = trn1 result
484 trn1 v21.8b, v1.8b, v3.8b //v21 = trn1 of rows 9 and 10
485 trn2 v3.8b, v1.8b, v3.8b //row9 &10
486 mov v1.8b, v21.8b //v1 = trn1 result
487 trn1 v21.8b, v5.8b, v7.8b //v21 = trn1 of rows 11 and 12
488 trn2 v7.8b, v5.8b, v7.8b //row11 & 12
489 mov v5.8b, v21.8b //v5 = trn1 result
490 trn1 v21.8b, v9.8b, v11.8b //v21 = trn1 of rows 13 and 14
491 trn2 v11.8b, v9.8b, v11.8b //row13 &14
492 mov v9.8b, v21.8b //v9 = trn1 result
493 trn1 v21.8b, v13.8b, v15.8b //v21 = trn1 of rows 15 and 16
494 trn2 v15.8b, v13.8b, v15.8b //row15 & 16
495 mov v13.8b, v21.8b //v13 = trn1 result
//Each mov below copies the temporary v21 into a destination register.
//NOTE(review): the instructions that refill v21 between these copies are not in
//this listing; presumably they are the trn1 steps on wider elements.
499 mov v2.8b, v21.8b
502 mov v10.8b, v21.8b
505 mov v3.8b, v21.8b
508 mov v11.8b, v21.8b
511 mov v6.8b, v21.8b
514 mov v7.8b, v21.8b
518 mov v0.8b, v21.8b
521 mov v8.8b, v21.8b
524 mov v1.8b, v21.8b
527 mov v9.8b, v21.8b
530 mov v0.8b, v21.8b
533 mov v1.8b, v21.8b
538 mov v2.8b, v21.8b
//Fragment (listing lines 541-646): normal deblocking filter on the transposed data,
//with p0/q0 limited to C and p1/q1 clamped to +/-C0 (v16 is read as C0).
//Register use as shown by the arithmetic below: p2=v2/v3, p1=v4/v5, p0=v6/v7,
//q0=v8/v9, q1=v10/v11, q2=v12/v13. w2 = alpha, w3 = beta. L/H = low/high 8 pixels.
//Only lines that matched the search are listed; shifts and register merges
//referred to by some comments are done by instructions that do not appear here.
541 urhadd v20.16b, v6.16b , v8.16b //((p0 + q0 + 1) >> 1)
545 mov v3.8b, v31.8b //v3 = v31 (the instruction that fills v31 is not in this listing)
546 movi v19.8b, #2 //v19 = 2, multiplier used by the umlsl below for (p1 << 1)
551 mov v4.8b, v31.8b //v4 = v31 (the instruction that fills v31 is not in this listing)
552 uabd v22.16b , v6.16b, v8.16b //ABS(p0 - q0)
555 mov v5.8b, v31.8b //v5 = v31 (the instruction that fills v31 is not in this listing)
562 uaddl v24.8h, v20.8b, v2.8b //(p2 + ((p0 + q0 + 1) >> 1) L
564 uaddl v26.8h, v21.8b, v3.8b //(p2 + ((p0 + q0 + 1) >> 1) H
565 umlsl v24.8h, v4.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
566 umlsl v26.8h, v5.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
567 dup v28.16b, w2 //v28 = alpha in every byte lane
568 cmhs v22.16b, v22.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
569 dup v28.16b, w3 //v28 = beta in every byte lane
570 uabd v30.16b , v10.16b, v8.16b //ABS(q1 - q0)
571 sqshrn v24.8b, v24.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
572 sqshrn v25.8b, v26.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
574 cmhs v30.16b, v30.16b , v28.16b //ABS(q1 - q0) >= Beta
575 uabd v26.16b , v4.16b, v6.16b //ABS(p1 - p0)
577 smin v24.16b, v24.16b , v16.16b //min(deltap1 ,C0)
578 orr v22.16b, v22.16b , v30.16b //ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
579 neg v30.16b, v16.16b //-C0
580 cmhs v26.16b, v26.16b , v28.16b //ABS(p1 - p0) >= Beta
581 smax v24.16b, v24.16b , v30.16b //max(deltap1,-C0)
582 …orr v22.16b, v22.16b , v26.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
584 uaddl v18.8h, v20.8b, v12.8b //q2 + ((p0 + q0 + 1) >> 1) L
586 usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) L
587 uaddl v20.8h, v21.8b, v13.8b //q2 + ((p0 + q0 + 1) >> 1) H
588 usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
589 usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) H
590 …orr v26.16b, v26.16b , v22.16b //v26 = (ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta), same value as v22
591 usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
592 sqshrn v18.8b, v18.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
593 uabd v22.16b , v2.16b, v6.16b //Ap = ABS(p2 - p0)
594 sqshrn v19.8b, v20.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
596 uabd v20.16b , v12.16b, v8.16b //Aq = ABS(q2 - q0)
597 cmhi v22.16b, v28.16b , v22.16b //Ap < Beta
598 smin v18.16b, v18.16b , v16.16b //min(deltaq1,C0)
599 cmhi v20.16b, v28.16b , v20.16b //Aq <Beta
600 usubl v28.8h, v8.8b, v6.8b //(q0 - p0) L
601 smax v18.16b, v18.16b , v30.16b //max(deltaq1,-C0)
602 usubl v30.8h, v9.8b, v7.8b //(q0 - p0) H
604 sub v16.16b, v16.16b , v22.16b //C0 + (Ap < Beta); a true compare lane is all ones (-1), so subtracting it adds 1
606 uaddw v28.8h, v28.8h , v4.8b //((q0 - p0) << 2) + p1 L
607 uaddw v30.8h, v30.8h , v5.8b //((q0 - p0) << 2) + p1 H
608 usubw v28.8h, v28.8h , v10.8b //((q0 - p0) << 2) + (p1 - q1) L
609 usubw v30.8h, v30.8h , v11.8b //((q0 - p0) << 2) + (p1 - q1) H
610 bic v22.16b, v22.16b , v26.16b //final condition for p1: (Ap < Beta) and no threshold test in v26 set
611 rshrn v28.8b, v28.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
612 rshrn v29.8b, v30.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
614 sub v16.16b, v16.16b , v20.16b //C = C0 + (Ap < Beta) + (Aq < Beta)
615 bic v20.16b, v20.16b , v26.16b //final condition for q1: (Aq < Beta) and no threshold test in v26 set
616 abs v30.16b, v28.16b //abs(delta)
617 and v24.16b, v24.16b , v22.16b //deltap1, zeroed where the p1 condition fails
618 and v18.16b, v18.16b , v20.16b //deltaq1, zeroed where the q1 condition fails
619 umin v30.16b, v30.16b , v16.16b //min(abs(delta),C)
620 add v4.16b, v4.16b , v24.16b //p1+deltap1
621 add v10.16b, v10.16b , v18.16b //q1+deltaq1
624 bic v30.16b, v30.16b , v26.16b //abs(delta) of pixels to be changed only
626 cmge v28.16b, v28.16b , #0 //v28 = mask of lanes where delta >= 0
627 uqsub v22.16b, v6.16b , v30.16b //clip(p0-delta)
//From here the transpose back (byte-wise trn1/trn2 with v21 as temporary) is
//interleaved with the rest of the filter arithmetic.
629 trn1 v21.8b, v0.8b, v2.8b //v21 = trn1 of v0 and v2
630 trn2 v2.8b, v0.8b, v2.8b //row1 &2
631 mov v0.8b, v21.8b //v0 = trn1 result
632 uqadd v6.16b, v6.16b , v30.16b //clip(p0+delta)
634 trn1 v21.8b, v1.8b, v3.8b //v21 = trn1 of v1 and v3
635 trn2 v3.8b, v1.8b, v3.8b //row9 &10
636 mov v1.8b, v21.8b //v1 = trn1 result
637 uqadd v24.16b, v8.16b , v30.16b //clip(q0+delta)
638 trn1 v21.8b, v12.8b, v14.8b //v21 = trn1 of v12 and v14
639 trn2 v14.8b, v12.8b, v14.8b //row7 & 8
640 mov v12.8b, v21.8b //v12 = trn1 result
641 uqsub v8.16b, v8.16b , v30.16b //clip(q0-delta)
642 trn1 v21.8b, v13.8b, v15.8b //v21 = trn1 of v13 and v15
643 trn2 v15.8b, v13.8b, v15.8b //row15 & 16
644 mov v13.8b, v21.8b //v13 = trn1 result
645 bif v6.16b, v22.16b , v28.16b //p0 = (delta >= 0) ? clip(p0+delta) : clip(p0-delta)
646 bif v8.16b, v24.16b , v28.16b //q0 = (delta >= 0) ? clip(q0-delta) : clip(q0+delta)
//Fragment (listing lines 649-726): finish transposing the filtered data back and
//store 16 rows of 8 bytes. v21 is the temporary for each trn1/trn2 pair.
//Only the byte-level (.8b) transpose steps matched the search; the steps on wider
//elements do not appear in this listing.
649 trn1 v21.8b, v4.8b, v6.8b //v21 = trn1 of v4 and v6
650 trn2 v6.8b, v4.8b, v6.8b //row3&row4
651 mov v4.8b, v21.8b //v4 = trn1 result
652 trn1 v21.8b, v8.8b, v10.8b //v21 = trn1 of v8 and v10
653 trn2 v10.8b, v8.8b, v10.8b //row5&6
654 mov v8.8b, v21.8b //v8 = trn1 result
655 trn1 v21.8b, v5.8b, v7.8b //v21 = trn1 of v5 and v7
656 trn2 v7.8b, v5.8b, v7.8b //row11 & 12
657 mov v5.8b, v21.8b //v5 = trn1 result
658 trn1 v21.8b, v9.8b, v11.8b //v21 = trn1 of v9 and v11
659 trn2 v11.8b, v9.8b, v11.8b //row13 &14
660 mov v9.8b, v21.8b //v9 = trn1 result
//Each mov below copies the temporary v21 into a destination register.
//NOTE(review): the instructions that refill v21 between these copies are not in
//this listing; presumably they are the trn1 steps on wider elements.
663 mov v2.8b, v21.8b
666 mov v10.8b, v21.8b
669 mov v3.8b, v21.8b
672 mov v11.8b, v21.8b
675 mov v6.8b, v21.8b
678 mov v7.8b, v21.8b
682 mov v0.8b, v21.8b
685 mov v8.8b, v21.8b
688 mov v1.8b, v21.8b
691 mov v9.8b, v21.8b
695 mov v0.8b, v21.8b
698 mov v1.8b, v21.8b
701 mov v2.8b, v21.8b
704 mov v3.8b, v21.8b
707 mov v4.8b, v21.8b
710 mov v5.8b, v21.8b
//Store the rows: even registers v0..v14 are rows 1-8, odd registers v1..v15 are
//rows 9-16; x0 advances by x1 after every row.
711 st1 {v0.8b}, [x0], x1 //row1
712 st1 {v2.8b}, [x0], x1 //row2
713 st1 {v4.8b}, [x0], x1 //row3
714 st1 {v6.8b}, [x0], x1 //row4
715 st1 {v8.8b}, [x0], x1 //row5
716 st1 {v10.8b}, [x0], x1 //row6
717 st1 {v12.8b}, [x0], x1 //row7
718 st1 {v14.8b}, [x0], x1 //row8
719 st1 {v1.8b}, [x0], x1 //row9
720 st1 {v3.8b}, [x0], x1 //row10
721 st1 {v5.8b}, [x0], x1 //row11
722 st1 {v7.8b}, [x0], x1 //row12
723 st1 {v9.8b}, [x0], x1 //row13
724 st1 {v11.8b}, [x0], x1 //row14
725 st1 {v13.8b}, [x0], x1 //row15
726 st1 {v15.8b}, [x0], x1 //row16
//Fragment (listing lines 778-858): load 16 rows of 8 bytes and start transposing them.
//Rows 1-8 go to the even registers v0..v14, rows 9-16 to the odd registers v1..v15;
//x0 advances by x1 after every row.
//Only the byte-level (.8b) steps of the transpose matched the search; the steps on
//wider elements do not appear in this listing.
778 ld1 {v0.8b}, [x0], x1 //row1
779 ld1 {v2.8b}, [x0], x1 //row2
780 ld1 {v4.8b}, [x0], x1 //row3
781 ld1 {v6.8b}, [x0], x1 //row4
782 ld1 {v8.8b}, [x0], x1 //row5
783 ld1 {v10.8b}, [x0], x1 //row6
784 ld1 {v12.8b}, [x0], x1 //row7
785 ld1 {v14.8b}, [x0], x1 //row8
786 ld1 {v1.8b}, [x0], x1 //row9
787 ld1 {v3.8b}, [x0], x1 //row10
788 ld1 {v5.8b}, [x0], x1 //row11
789 ld1 {v7.8b}, [x0], x1 //row12
790 ld1 {v9.8b}, [x0], x1 //row13
791 ld1 {v11.8b}, [x0], x1 //row14
792 ld1 {v13.8b}, [x0], x1 //row15
793 ld1 {v15.8b}, [x0], x1 //row16
//Byte-wise trn1/trn2 on each pair of rows; v21 is a temporary holding the trn1
//result until trn2 has consumed the original first operand.
797 trn1 v21.8b, v0.8b, v2.8b //v21 = trn1 of rows 1 and 2
798 trn2 v2.8b, v0.8b, v2.8b //row1 &2
799 mov v0.8b, v21.8b //v0 = trn1 result
800 trn1 v21.8b, v4.8b, v6.8b //v21 = trn1 of rows 3 and 4
801 trn2 v6.8b, v4.8b, v6.8b //row3&row4
802 mov v4.8b, v21.8b //v4 = trn1 result
803 trn1 v21.8b, v8.8b, v10.8b //v21 = trn1 of rows 5 and 6
804 trn2 v10.8b, v8.8b, v10.8b //row5&6
805 mov v8.8b, v21.8b //v8 = trn1 result
806 trn1 v21.8b, v12.8b, v14.8b //v21 = trn1 of rows 7 and 8
807 trn2 v14.8b, v12.8b, v14.8b //row7 & 8
808 mov v12.8b, v21.8b //v12 = trn1 result
809 trn1 v21.8b, v1.8b, v3.8b //v21 = trn1 of rows 9 and 10
810 trn2 v3.8b, v1.8b, v3.8b //row9 &10
811 mov v1.8b , v21.8b //v1 = trn1 result
812 trn1 v21.8b, v5.8b, v7.8b //v21 = trn1 of rows 11 and 12
813 trn2 v7.8b, v5.8b, v7.8b //row11 & 12
814 mov v5.8b , v21.8b //v5 = trn1 result
815 trn1 v21.8b, v9.8b, v11.8b //v21 = trn1 of rows 13 and 14
816 trn2 v11.8b, v9.8b, v11.8b //row13 &14
817 mov v9.8b , v21.8b //v9 = trn1 result
818 trn1 v21.8b, v13.8b, v15.8b //v21 = trn1 of rows 15 and 16
819 trn2 v15.8b, v13.8b, v15.8b //row15 & 16
820 mov v13.8b , v21.8b //v13 = trn1 result
//Each mov below copies the temporary v21 into a destination register.
//NOTE(review): the instructions that refill v21 between these copies are not in
//this listing; presumably they are the trn1 steps on wider elements.
824 mov v2.8b, v21.8b
827 mov v10.8b , v21.8b
830 mov v3.8b, v21.8b
833 mov v11.8b, v21.8b
836 mov v6.8b, v21.8b
839 mov v7.8b, v21.8b
843 mov v0.8b , v21.8b
846 mov v8.8b, v21.8b
849 mov v1.8b, v21.8b
852 mov v9.8b , v21.8b
855 mov v0.8b, v21.8b
858 mov v1.8b, v21.8b
//Fragment (listing lines 866-975): strong deblocking filter on the transposed data.
//Register use as shown by the arithmetic below: p3=v0/v1, p2=v2/v3, p1=v4/v5,
//p0=v6/v7, q0=v8/v9, q1=v10/v11, q2=v12/v13. w2 = alpha, w3 = beta.
//L/H = low/high 8 pixels.
//Only lines that matched the search are listed; several sums named in the comments
//are completed by instructions that do not appear here.
866 mov v4.8b, v21.8b //v4 = v21 (the instruction that fills v21 is not in this listing)
870 mov v5.8b, v21.8b //v5 = v21 (the instruction that fills v21 is not in this listing)
871 uaddl v16.8h, v6.8b, v8.8b //p0+q0 L
874 mov v2.8b, v21.8b //v2 = v21 (the instruction that fills v21 is not in this listing)
875 uaddl v18.8h, v7.8b, v9.8b //p0+q0 H
878 mov v3.8b, v21.8b //v3 = v21 (the instruction that fills v21 is not in this listing)
879 uaddw v20.8h, v16.8h , v4.8b //p0+q0+p1 L
880 uaddw v22.8h, v18.8h , v5.8b //p0+q0+p1 H
881 uaddl v24.8h, v2.8b, v10.8b //p2+q1 L
882 uaddl v26.8h, v3.8b, v11.8b //p2+q1 H
885 movi v28.16b, #2 //v28 = 2 in every byte lane
886 uaddw v16.8h, v20.8h , v2.8b //p0+q0+p1+p2 L
887 uaddw v18.8h, v22.8h , v3.8b //p0+q0+p1+p2 H
888 dup v30.16b, w2 //v30 = alpha in every byte lane
889 rshrn v20.8b, v16.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
890 rshrn v21.8b, v18.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
900 uabd v22.16b , v6.16b, v8.16b //v22 = ABS(p0 - q0)
901 usra v28.16b, v30.16b, #2 //v28 = (alpha >> 2) + 2
902 uabd v30.16b , v2.16b, v6.16b //v30 = Ap = ABS(p2 - p0)
903 rshrn v24.8b, v24.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
904 rshrn v25.8b, v26.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
906 dup v26.16b, w3 //v26 = beta in every byte lane
907 cmhi v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2)
908 uaddl v22.8h, v6.8b, v10.8b //p0+q1 L
909 cmhi v14.16b, v26.16b , v30.16b //v14 = (Ap < Beta)
910 uaddl v30.8h, v7.8b, v11.8b //p0+q1 H
911 uaddw v22.8h, v22.8h , v4.8b //p0+q1+p1 L
912 uaddw v30.8h, v30.8h , v5.8b //p0+q1+p1 H
913 uaddw v22.8h, v22.8h , v4.8b //p0+q1+2*p1 L
914 uaddw v30.8h, v30.8h , v5.8b //p0+q1+2*p1 H
915 and v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
916 rshrn v22.8b, v22.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) L p0"
917 rshrn v23.8b, v30.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) H p0"
919 uaddl v30.8h, v2.8b, v0.8b //p2+p3 L
920 bif v24.16b, v22.16b , v14.16b //v24 = p0' where the v14 condition holds, p0" elsewhere
921 uaddl v22.8h, v3.8b, v1.8b //p2+p3 H
926 uabd v30.16b , v12.16b, v8.16b //v30 = Aq = ABS(q2 - q0)
927 uabd v22.16b , v10.16b, v8.16b //v22 = ABS(q1 - q0)
928 rshrn v16.8b, v16.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
929 rshrn v17.8b, v18.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
931 uabd v18.16b , v4.16b, v6.16b //v18 = ABS(p1 - p0)
932 cmhi v30.16b, v26.16b , v30.16b //Aq < Beta
933 cmhs v22.16b, v22.16b, v26.16b //ABS(q1 - q0) >= Beta
934 cmhs v18.16b, v18.16b, v26.16b //ABS(p1 - p0) >= Beta
935 dup v26.16b, w2 //v26 = alpha in every byte lane
936 and v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
937 uabd v28.16b , v6.16b, v8.16b //v28 = ABS(p0 - q0)
938 orr v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
939 uaddl v18.8h, v6.8b, v8.8b //p0+q0 L
940 cmhs v28.16b, v28.16b, v26.16b //ABS(p0 - q0) >= Alpha
941 uaddl v26.8h, v7.8b, v9.8b //p0+q0 H
942 uaddw v18.8h, v18.8h , v10.8b //p0+q0+q1 L
943 …orr v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
944 uaddw v26.8h, v26.8h , v11.8b //p0+q0+q1 H
945 bic v14.16b, v14.16b , v22.16b //final condn for p's: v14 and no threshold test in v22 set
946 movi v28.16b, #2 //v28 = 2, multiplier for the umlal below
947 bif v6.16b, v24.16b , v22.16b //final p0: keep p0 where v22 is set, take the filtered value elsewhere
948 bit v2.16b, v16.16b , v14.16b //final p2: take p2' where v14 is set
949 bif v20.16b, v4.16b , v14.16b //final p1: v20 keeps p1' where v14 is set, takes p1 elsewhere
953 uaddl v24.8h, v8.8b, v4.8b //q0+p1 L
954 umlal v24.8h, v10.8b, v28.8b //X2(q1) + q0 + p1 L
955 uaddl v16.8h, v9.8b, v5.8b //q0+p1 H
956 umlal v16.8h, v11.8b, v28.8b //X2(q1) + q0 + p1 H
958 uaddl v14.8h, v4.8b, v12.8b //p1+q2 L
960 uaddl v4.8h, v5.8b, v13.8b //p1+q2H
962 rshrn v24.8b, v24.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; L q0'
963 rshrn v25.8b, v16.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; H q0'
965 uaddw v18.8h, v18.8h , v12.8b //p0 + q0 + q1 + q2 L
966 uaddw v26.8h, v26.8h , v13.8b //p0 + q0 + q1 + q2 H
967 rshrn v16.8b, v14.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L q0"
968 mov v14.16b, v31.16b //v14 = v31; NOTE(review): presumably restores a value saved in v31 by a line not in this listing
969 rshrn v17.8b, v4.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H q0"
971 rshrn v4.8b, v18.8h, #2 //(p0 + q0 + q1 + q2 + 2)>>2 L q1'
972 rshrn v5.8b, v26.8h, #2 //(p0 + q0 + q1 + q2 + 2)>>2 H q1'
974 bit v24.16b, v16.16b , v30.16b //v24 = q0" where the v30 condition holds, q0' elsewhere
975 bic v30.16b, v30.16b , v22.16b //final condn for q's: v30 and no threshold test in v22 set
//Fragment (listing lines 976-1080): finish the q-side selection of the strong filter,
//transpose the data back and store 16 rows of 8 bytes.
//v31 is the temporary for each trn1/trn2 pair here; v20/v21 take the place of
//v4/v5 in the transpose and are stored as rows 3 and 11.
//Only lines that matched the search are listed; the transpose steps on wider
//elements and parts of the q2' sum do not appear here.
976 trn1 v31.8b, v0.8b, v2.8b //v31 = trn1 of v0 and v2
977 trn2 v2.8b, v0.8b, v2.8b //row1 &2
978 mov v0.8b, v31.8b //v0 = trn1 result
979 bit v10.16b, v4.16b , v30.16b //final q1: take v4 where v30 is set, keep q1 elsewhere
983 trn1 v31.8b, v1.8b, v3.8b //v31 = trn1 of v1 and v3
984 trn2 v3.8b, v1.8b, v3.8b //row9 &10
985 mov v1.8b, v31.8b //v1 = trn1 result
986 uaddl v16.8h, v12.8b, v14.8b //q2+q3 L
987 trn1 v31.8b, v20.8b, v6.8b //v31 = trn1 of v20 and v6
988 trn2 v6.8b, v20.8b, v6.8b //row3&row4
989 mov v20.8b , v31.8b //v20 = trn1 result
990 uaddl v4.8h, v13.8b, v15.8b //q2+q3 H
991 trn1 v31.8b, v21.8b, v7.8b //v31 = trn1 of v21 and v7
992 trn2 v7.8b, v21.8b, v7.8b //row11 & 12
993 mov v21.8b , v31.8b //v21 = trn1 result
997 mov v2.8b, v31.8b //v2 = v31 (the instruction that refills v31 is not in this listing)
1001 mov v3.8b , v31.8b //v3 = v31 (the instruction that refills v31 is not in this listing)
1002 bif v8.16b, v24.16b , v22.16b //final q0: keep q0 where v22 is set, take v24 elsewhere
1006 mov v0.8b , v31.8b //v0 = v31 (the instruction that refills v31 is not in this listing)
1007 rshrn v18.8b, v18.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
1010 mov v1.8b, v31.8b //v1 = v31 (the instruction that refills v31 is not in this listing)
1011 rshrn v19.8b, v26.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
1013 trn1 v31.8b, v8.8b, v10.8b //v31 = trn1 of v8 and v10
1014 trn2 v10.8b, v8.8b, v10.8b //row5&6
1015 mov v8.8b, v31.8b //v8 = trn1 result
1016 bit v12.16b, v18.16b , v30.16b //final q2: take v18 where v30 is set, keep q2 elsewhere
1018 trn1 v31.8b, v9.8b, v11.8b //v31 = trn1 of v9 and v11
1019 trn2 v11.8b, v9.8b, v11.8b //row13 &14
1020 mov v9.8b, v31.8b //v9 = trn1 result
1021 trn1 v31.8b, v12.8b, v14.8b //v31 = trn1 of v12 and v14
1022 trn2 v14.8b, v12.8b, v14.8b //row7 & 8
1023 mov v12.8b, v31.8b //v12 = trn1 result
1024 trn1 v31.8b, v13.8b, v15.8b //v31 = trn1 of v13 and v15
1025 trn2 v15.8b, v13.8b, v15.8b //row15 & 16
1026 mov v13.8b , v31.8b //v13 = trn1 result
//Each mov below copies the temporary v31 into a destination register.
//NOTE(review): the instructions that refill v31 between these copies are not in
//this listing; presumably they are the trn1 steps on wider elements.
1029 mov v10.8b, v31.8b
1032 mov v11.8b, v31.8b
1036 mov v8.8b, v31.8b
1039 mov v9.8b, v31.8b
1043 mov v6.8b , v31.8b
1046 mov v7.8b, v31.8b
1049 mov v0.8b , v31.8b
1052 mov v1.8b , v31.8b
1055 mov v2.8b , v31.8b
1058 mov v3.8b , v31.8b
1061 mov v20.8b , v31.8b
1064 mov v21.8b, v31.8b
//Store the rows; x0 advances by x1 after every row. Rows 3 and 11 come from
//v20 and v21 rather than v4 and v5.
1065 st1 {v0.8b}, [x0], x1 //row1
1066 st1 {v2.8b}, [x0], x1 //row2
1067 st1 {v20.8b}, [x0], x1 //row3
1068 st1 {v6.8b}, [x0], x1 //row4
1069 st1 {v8.8b}, [x0], x1 //row5
1070 st1 {v10.8b}, [x0], x1 //row6
1071 st1 {v12.8b}, [x0], x1 //row7
1072 st1 {v14.8b}, [x0], x1 //row8
1073 st1 {v1.8b}, [x0], x1 //row9
1074 st1 {v3.8b}, [x0], x1 //row10
1075 st1 {v21.8b}, [x0], x1 //row11
1076 st1 {v7.8b}, [x0], x1 //row12
1077 st1 {v9.8b}, [x0], x1 //row13
1078 st1 {v11.8b}, [x0], x1 //row14
1079 st1 {v13.8b}, [x0], x1 //row15
1080 st1 {v15.8b}, [x0], x1 //row16