Lines Matching refs:b — excerpt from the AArch64 NEON vpaes assembly; the leading number on each entry is the source line, and the trailing // comments give the x86 vpaes instruction that each NEON instruction replaces.
120 movi v17.16b, #0x0f
151 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
152 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
153 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
155 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
156 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
157 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
158 b .Lenc_entry
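
The block ending at the branch above is the one-block encryption entry: the input in v7 is split into low and high nibbles, each nibble indexes a 16-byte table (v20/v21) via tbl, and the results are folded together with the value in v16 (which the surrounding code loads with the first round key) before jumping into the round loop. A minimal scalar sketch of that per-byte pattern, with placeholder table names (ipt_lo/ipt_hi are not the names used in the source):

    #include <stdint.h>

    /* Scalar sketch only: what the and/ushr/tbl/eor sequence computes per byte.
     * ipt_lo and ipt_hi stand in for the 16-byte tables held in v20 and v21,
     * and rk for the round key assumed to be in v16. */
    static void input_transform(uint8_t out[16], const uint8_t in[16],
                                const uint8_t ipt_lo[16],
                                const uint8_t ipt_hi[16],
                                const uint8_t rk[16]) {
        for (int i = 0; i < 16; i++) {
            uint8_t lo = in[i] & 0x0F;                 /* and  v1, v7, v17  */
            uint8_t hi = in[i] >> 4;                   /* ushr v0, v7, #4   */
            out[i] = ipt_lo[lo] ^ rk[i] ^ ipt_hi[hi];  /* tbl, tbl, two eor */
        }
    }
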
164 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
166 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
167 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
168 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
169 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
170 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
172 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
173 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
174 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
175 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
176 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
177 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
179 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
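
The round body above carries the comments 2A, 2A+B, 2B+C, 2A+3B+C+D: A is the S-boxed state (from the sb1/sb2 lookups), the tbl instructions that index through v1 act as fixed byte rotations of it, and the eor chain accumulates the MixColumns combination. For reference, the same 2A+3B+C+D coefficient pattern written out per column in plain C (ordinary AES MixColumns, shown only to decode the comments; the NEON code reaches the same result without per-byte xtime):

    #include <stdint.h>

    /* Standard MixColumns on one 4-byte column; multiplication by 2 in
     * GF(2^8) is xtime, and 3x = 2x ^ x. */
    static uint8_t xtime(uint8_t x) {
        return (uint8_t)((x << 1) ^ ((x >> 7) * 0x1B));
    }

    static void mix_column(uint8_t c[4]) {
        uint8_t a = c[0], b = c[1], d = c[2], e = c[3];
        c[0] = xtime(a) ^ (xtime(b) ^ b) ^ d ^ e;   /* 2A + 3B + C + D */
        c[1] = a ^ xtime(b) ^ (xtime(d) ^ d) ^ e;
        c[2] = a ^ b ^ xtime(d) ^ (xtime(e) ^ e);
        c[3] = (xtime(a) ^ a) ^ b ^ d ^ xtime(e);
    }
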
184 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
185 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
186 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
187 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
188 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
189 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
190 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
191 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
192 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
193 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
194 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
195 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
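
The entry block above computes the S-box front end the comments label k, i, j, a/k, 1/i, 1/j, iak, jak, io, jo: the byte is split into nibbles and a short chain of 16-entry lookups (through v18/v19) and xors produces the two indices that the output tables consume. A scalar rendering of that chain, with placeholder names (tbl_ak, tbl_inv) for the constants held in v19 and v18:

    #include <stdint.h>

    /* Scalar sketch of the nibble-inversion front end.  tbl_ak and tbl_inv
     * are placeholder names; their entries are 4-bit values, so every
     * chained index stays in 0..15. */
    static void inv_front_end(uint8_t x,
                              const uint8_t tbl_ak[16],
                              const uint8_t tbl_inv[16],
                              uint8_t *io, uint8_t *jo) {
        uint8_t k   = x & 0x0F;              /* 0 = k               */
        uint8_t i   = x >> 4;                /* 1 = i               */
        uint8_t ak  = tbl_ak[k];             /* 2 = a/k             */
        uint8_t j   = k ^ i;                 /* 0 = j               */
        uint8_t iak = tbl_inv[i] ^ ak;       /* 3 = iak = 1/i + a/k */
        uint8_t jak = tbl_inv[j] ^ ak;       /* 4 = jak = 1/j + a/k */
        *io = tbl_inv[iak] ^ j;              /* 2 = io              */
        *jo = tbl_inv[jak] ^ i;              /* 3 = jo              */
    }
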
203 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
205 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
206 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
207 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
208 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
220 ld1 {v7.16b}, [x0]
223 st1 {v0.16b}, [x1]
238 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
239 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
240 and v9.16b, v15.16b, v17.16b
241 ushr v8.16b, v15.16b, #4
242 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
243 tbl v9.16b, {v20.16b}, v9.16b
245 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
246 tbl v10.16b, {v21.16b}, v8.16b
247 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
248 eor v8.16b, v9.16b, v16.16b
249 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
250 eor v8.16b, v8.16b, v10.16b
251 b .Lenc_2x_entry
257 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
258 tbl v12.16b, {v25.16b}, v10.16b
260 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
261 tbl v8.16b, {v24.16b}, v11.16b
262 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
263 eor v12.16b, v12.16b, v16.16b
264 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
265 tbl v13.16b, {v27.16b}, v10.16b
266 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
267 eor v8.16b, v8.16b, v12.16b
268 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
269 tbl v10.16b, {v26.16b}, v11.16b
271 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
272 tbl v11.16b, {v8.16b}, v1.16b
273 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
274 eor v10.16b, v10.16b, v13.16b
275 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
276 tbl v8.16b, {v8.16b}, v4.16b
277 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
278 eor v11.16b, v11.16b, v10.16b
279 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
280 tbl v12.16b, {v11.16b},v1.16b
281 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
282 eor v8.16b, v8.16b, v11.16b
284 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
285 eor v8.16b, v8.16b, v12.16b
290 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
291 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
292 and v9.16b, v8.16b, v17.16b
293 ushr v8.16b, v8.16b, #4
294 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
295 tbl v13.16b, {v19.16b},v9.16b
296 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
297 eor v9.16b, v9.16b, v8.16b
298 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
299 tbl v11.16b, {v18.16b},v8.16b
300 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
301 tbl v12.16b, {v18.16b},v9.16b
302 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
303 eor v11.16b, v11.16b, v13.16b
304 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
305 eor v12.16b, v12.16b, v13.16b
306 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
307 tbl v10.16b, {v18.16b},v11.16b
308 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
309 tbl v11.16b, {v18.16b},v12.16b
310 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
311 eor v10.16b, v10.16b, v9.16b
312 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
313 eor v11.16b, v11.16b, v8.16b
321 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
322 tbl v12.16b, {v22.16b}, v10.16b
324 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
325 tbl v8.16b, {v23.16b}, v11.16b
326 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
327 eor v12.16b, v12.16b, v16.16b
328 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
329 eor v8.16b, v8.16b, v12.16b
330 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
331 tbl v1.16b, {v8.16b},v1.16b
340 movi v17.16b, #0x0f
372 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
373 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
374 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
377 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
378 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
379 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
380 b .Ldec_entry
389 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
390 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
391 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
393 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
396 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
397 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
398 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
399 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
401 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
404 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
405 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
406 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
407 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
409 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
412 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
413 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
414 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
415 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
416 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
417 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
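
In the decryption round above, four table pairs (sb9, sbd, sbb, sbe) are accumulated into the running value "ch", and between pairs the accumulator is re-permuted through v5; the ext instruction at the end rotates that 16-byte permutation for the next round, matching the vpalignr $12 in the comment. In scalar terms that rotation is simply:

    #include <stdint.h>
    #include <string.h>

    /* Equivalent of "ext v5.16b, v5.16b, v5.16b, #12" (vpalignr $12):
     * byte j of the result is byte (j + 12) mod 16 of the source. */
    static void rotate_bytes_12(uint8_t v[16]) {
        uint8_t t[16];
        for (int j = 0; j < 16; j++)
            t[j] = v[(j + 12) & 15];
        memcpy(v, t, 16);
    }
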
422 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
423 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
424 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
425 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
426 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
427 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
428 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
429 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
430 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
431 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
432 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
433 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
439 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
442 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
443 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
444 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
445 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
457 ld1 {v7.16b}, [x0]
460 st1 {v0.16b}, [x1]
484 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
485 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
486 and v9.16b, v15.16b, v17.16b
487 ushr v8.16b, v15.16b, #4
488 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
489 tbl v10.16b, {v20.16b},v9.16b
492 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
493 tbl v8.16b, {v21.16b},v8.16b
494 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
495 eor v10.16b, v10.16b, v16.16b
496 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
497 eor v8.16b, v8.16b, v10.16b
498 b .Ldec_2x_entry
507 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
508 tbl v12.16b, {v24.16b}, v10.16b
509 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
510 tbl v9.16b, {v25.16b}, v11.16b
511 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
512 eor v8.16b, v12.16b, v16.16b
514 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
515 eor v8.16b, v8.16b, v9.16b
518 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
519 tbl v12.16b, {v26.16b}, v10.16b
520 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
521 tbl v8.16b, {v8.16b},v5.16b
522 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
523 tbl v9.16b, {v27.16b}, v11.16b
524 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
525 eor v8.16b, v8.16b, v12.16b
527 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
528 eor v8.16b, v8.16b, v9.16b
531 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
532 tbl v12.16b, {v28.16b}, v10.16b
533 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
534 tbl v8.16b, {v8.16b},v5.16b
535 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
536 tbl v9.16b, {v29.16b}, v11.16b
537 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
538 eor v8.16b, v8.16b, v12.16b
540 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
541 eor v8.16b, v8.16b, v9.16b
544 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
545 tbl v12.16b, {v30.16b}, v10.16b
546 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
547 tbl v8.16b, {v8.16b},v5.16b
548 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
549 tbl v9.16b, {v31.16b}, v11.16b
550 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
551 eor v8.16b, v8.16b, v12.16b
552 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
553 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
554 eor v8.16b, v8.16b, v9.16b
559 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
560 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
561 and v9.16b, v8.16b, v17.16b
562 ushr v8.16b, v8.16b, #4
563 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
564 tbl v10.16b, {v19.16b},v9.16b
565 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
566 eor v9.16b, v9.16b, v8.16b
567 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
568 tbl v11.16b, {v18.16b},v8.16b
569 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
570 tbl v12.16b, {v18.16b},v9.16b
571 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
572 eor v11.16b, v11.16b, v10.16b
573 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
574 eor v12.16b, v12.16b, v10.16b
575 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
576 tbl v10.16b, {v18.16b},v11.16b
577 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
578 tbl v11.16b, {v18.16b},v12.16b
579 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
580 eor v10.16b, v10.16b, v9.16b
581 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
582 eor v11.16b, v11.16b, v8.16b
588 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
589 tbl v12.16b, {v22.16b}, v10.16b
591 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
592 tbl v9.16b, {v23.16b}, v11.16b
594 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
595 eor v12.16b, v12.16b, v16.16b
596 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
597 eor v8.16b, v9.16b, v12.16b
598 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
599 tbl v1.16b, {v8.16b},v2.16b
612 movi v16.16b, #0x5b // .Lk_s63
615 movi v17.16b, #0x0f // .Lk_s0F
637 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
640 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
642 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
652 b .Lschedule_go
657 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
663 b.hi .Lschedule_256
664 b.eq .Lschedule_192
683 b .Loop_schedule_128
703 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
705 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
706 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
713 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
721 b .Loop_schedule_192
735 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
742 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
751 movi v4.16b, #0
752 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
753 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
755 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
757 b .Loop_schedule_256
782 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
787 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
792 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
793 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
794 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
795 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
796 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
797 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
798 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
799 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
810 ## %xmm7: high side, b a x y
815 ## %xmm6: b+c+d b+c 0 0
816 ## %xmm0: b+c+d b+c b a
821 movi v1.16b, #0
825 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
826 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
827 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
828 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
855 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
856 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
857 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
858 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
862 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
869 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
870 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
871 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
874 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
875 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
876 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
877 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
878 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
879 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
880 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
881 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
882 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
883 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
884 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
885 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
886 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
887 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
888 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
889 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
890 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
893 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
894 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
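
The key-schedule round above also shows how the port handles x86 byte shifts: NEON has no direct vpslldq, so v4 is zeroed with movi and ext then selects a 16-byte window from the concatenation of the zero register (low half) and v7 (high half); starting the window at offset 12 or 8 leaves the bottom 4 or 8 bytes zero and moves v7's low bytes up, exactly vpslldq $4 / $8. A scalar sketch of that substitution:

    #include <stdint.h>
    #include <string.h>

    /* What the movi+ext pair emulates: a whole-register shift left by n
     * bytes with zero fill (PSLLDQ).  ext with a zeroed first operand and
     * immediate 16-n produces exactly this. */
    static void pslldq(uint8_t out[16], const uint8_t in[16], int n) {
        memset(out, 0, 16);
        memcpy(out + n, in, 16 - n);   /* byte i of in -> byte i+n of out */
    }
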
910 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
911 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
913 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
915 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
916 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
946 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
951 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
953 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
954 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
955 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
956 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
958 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
960 b .Lschedule_mangle_both
965 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
966 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
969 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
971 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
972 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
973 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
976 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
977 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
979 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
980 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
981 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
984 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
985 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
987 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
988 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
991 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
992 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
994 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
996 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
997 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
1002 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1065 b.eq vpaes_cbc_decrypt
1073 ld1 {v0.16b}, [x4] // load ivec
1075 b .Lcbc_enc_loop
1079 ld1 {v7.16b}, [x0],#16 // load input
1080 eor v7.16b, v7.16b, v0.16b // xor with ivec
1082 st1 {v0.16b}, [x1],#16 // save output
1084 b.hi .Lcbc_enc_loop
1086 st1 {v0.16b}, [x4] // write ivec
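
The CBC-encrypt lines show the usual chaining: load a block, xor it with the current IV (initially loaded from x4, afterwards the previous ciphertext), encrypt, store, and finally write the last ciphertext back as the new IV. A hedged scalar sketch of that loop structure (block_encrypt is a placeholder for the one-block core, not an API defined in this file):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Sketch of the chaining only; block_encrypt stands in for the
     * encryption core the surrounding assembly calls per block. */
    static void cbc_encrypt(uint8_t *out, const uint8_t *in, size_t blocks,
                            uint8_t iv[16],
                            void (*block_encrypt)(uint8_t blk[16])) {
        uint8_t blk[16];
        while (blocks--) {
            memcpy(blk, in, 16);                           /* ld1 {v7.16b}, [x0],#16 */
            for (int i = 0; i < 16; i++) blk[i] ^= iv[i];  /* eor v7, v7, v0         */
            block_encrypt(blk);
            memcpy(out, blk, 16);                          /* st1 {v0.16b}, [x1],#16 */
            memcpy(iv, blk, 16);                           /* ciphertext is the next IV,
                                                              written back at the end  */
            in += 16;
            out += 16;
        }
    }
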
1105 ld1 {v6.16b}, [x4] // load ivec
1108 b.eq .Lcbc_dec_loop2x
1110 ld1 {v7.16b}, [x0], #16 // load input
1112 eor v0.16b, v0.16b, v6.16b // xor with ivec
1113 orr v6.16b, v7.16b, v7.16b // next ivec value
1114 st1 {v0.16b}, [x1], #16
1116 b.ls .Lcbc_dec_done
1120 ld1 {v14.16b,v15.16b}, [x0], #32
1122 eor v0.16b, v0.16b, v6.16b // xor with ivec
1123 eor v1.16b, v1.16b, v14.16b
1124 orr v6.16b, v15.16b, v15.16b
1125 st1 {v0.16b,v1.16b}, [x1], #32
1127 b.hi .Lcbc_dec_loop2x
1130 st1 {v6.16b}, [x4]
1160 ld1 {v7.16b}, [x4]
1165 b.eq .Lctr32_prep_loop
1169 ld1 {v6.16b}, [x0], #16 // Load input ahead of time
1171 eor v0.16b, v0.16b, v6.16b // XOR input and result
1172 st1 {v0.16b}, [x1], #16
1178 b.ls .Lctr32_done
1183 mov v15.16b, v7.16b
1184 mov v14.16b, v7.16b
1190 ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
1192 eor v0.16b, v0.16b, v6.16b // XOR input and result
1193 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
1194 st1 {v0.16b,v1.16b}, [x1], #32
1203 b.hi .Lctr32_loop
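
The CTR lines follow the usual pattern: the counter block (loaded from x4 into v7, and duplicated into v14/v15 for the two-block path) is encrypted, the keystream is xored with the input, and the counter is bumped for the next block. A hedged scalar sketch, assuming the conventional big-endian 32-bit counter in the last word that the ctr32 naming suggests:

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Sketch of the loop structure; block_encrypt stands in for the
     * encryption core, and the counter layout (32-bit big-endian counter
     * in bytes 12..15) is an assumption based on the ctr32 naming. */
    static void ctr32_encrypt(uint8_t *out, const uint8_t *in, size_t blocks,
                              uint8_t ctr[16],
                              void (*block_encrypt)(uint8_t blk[16])) {
        uint8_t ks[16];
        while (blocks--) {
            memcpy(ks, ctr, 16);
            block_encrypt(ks);                                    /* encrypt the counter block */
            for (int i = 0; i < 16; i++) out[i] = in[i] ^ ks[i];  /* eor v0, v0, v6            */
            for (int i = 15; i >= 12 && ++ctr[i] == 0; i--)       /* bump the 32-bit counter   */
                ;
            in += 16;
            out += 16;
        }
    }
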