Lines Matching refs:b (symbol cross-reference hits for "b" in the vpaes-armv8 assembly; the leading number on each line is its line number in the source file)

119 movi v17.16b, #0x0f
150 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
151 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
152 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
154 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
155 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
156 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
157 b Lenc_entry
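
The lines above are the vpaes input transform: each state byte is split into a low nibble (and with v17 = 0x0f) and a high nibble (ushr #4), each nibble indexes a 16-byte table through tbl (the NEON counterpart of SSSE3 vpshufb), and the two lookups are XORed together with the first round key in v16. A minimal C model of this nibble-LUT pattern, with placeholder tables rather than the real input-transform constants:

    #include <stdint.h>

    /* Scalar model of: and/ushr to split nibbles, two tbl lookups, eor.
     * lut_lo/lut_hi are hypothetical 16-byte tables, not the vpaes ones. */
    static void nibble_lut16(uint8_t out[16], const uint8_t in[16],
                             const uint8_t lut_lo[16], const uint8_t lut_hi[16],
                             const uint8_t rk[16])
    {
        for (int i = 0; i < 16; i++)
            out[i] = lut_lo[in[i] & 0x0f] ^ lut_hi[in[i] >> 4] ^ rk[i];
    }

Every table-driven step in this file (S-box, MixColumns multiples, basis changes) reduces to this one primitive.
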
163 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
165 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
166 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
167 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
168 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
169 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
171 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
172 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
173 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
174 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
175 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
176 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
178 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
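
The comment trail through the encryption loop (A, B, D, 2A, 2A+B, 2B+C, 2A+B+D, 2A+3B+C+D) is MixColumns decomposed into a doubled column plus byte rotations: per output byte it is the familiar out_i = 2*a_i + 3*a_{i+1} + a_{i+2} + a_{i+3} over GF(2^8). A scalar sketch of one column under the AES polynomial:

    #include <stdint.h>

    /* GF(2^8) doubling with the AES polynomial x^8+x^4+x^3+x+1 (0x1b). */
    static uint8_t xtime(uint8_t a)
    {
        return (uint8_t)((a << 1) ^ ((a >> 7) ? 0x1b : 0));
    }

    /* One MixColumns column, written as the 2A + 3B + C + D decomposition
     * traced by the comments above: A is the S-boxed column, B/C/D are its
     * byte rotations by 1, 2 and 3. */
    static void mix_column(uint8_t out[4], const uint8_t a[4])
    {
        for (int i = 0; i < 4; i++) {
            uint8_t A = a[i], B = a[(i + 1) & 3];
            uint8_t C = a[(i + 2) & 3], D = a[(i + 3) & 3];
            out[i] = (uint8_t)(xtime(A) ^ xtime(B) ^ B ^ C ^ D); /* 2A+3B+C+D */
        }
    }
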
183 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
184 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
185 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
186 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
187 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
188 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
189 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
190 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
191 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
192 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
193 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
194 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
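
Lenc_entry is byte inversion in GF(2^8) done entirely on nibbles: with high nibble i, low nibble k and j = i+k, the 1/i, 1/j, iak, jak, io, jo values walk a tower-field inversion. Below is a self-checking C sketch of the same idea over the textbook tower GF(16)[z]/(z^2+z+0x8) with GF(16) = GF(2)[x]/(x^4+x+1); the real vpaes tables use a different basis and fold the basis change into the lookups, so the constants here are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    /* GF(16) = GF(2)[x]/(x^4+x+1). */
    static uint8_t gf16_mul(uint8_t a, uint8_t b)
    {
        uint8_t r = 0;
        while (b) {
            if (b & 1) r ^= a;
            a = (uint8_t)(((a << 1) ^ ((a & 8) ? 0x13 : 0)) & 0x0f);
            b >>= 1;
        }
        return r;
    }

    int main(void)
    {
        uint8_t inv16[16] = {0};
        for (int a = 1; a < 16; a++)
            for (int b = 1; b < 16; b++)
                if (gf16_mul((uint8_t)a, (uint8_t)b) == 1) inv16[a] = (uint8_t)b;

        /* GF(256) as GF(16)[z]/(z^2 + z + L), L = 0x8.  For g = hi*z + lo:
         *   g^-1 = (hi*t)*z + (hi^lo)*t,  t = 1 / (hi^2*L ^ hi*lo ^ lo^2),
         * the same inverse-of-nibble-combinations dataflow as i/j/iak/jak. */
        const uint8_t L = 0x8;
        for (int g = 1; g < 256; g++) {
            uint8_t hi = (uint8_t)(g >> 4), lo = (uint8_t)(g & 0x0f);
            uint8_t t = inv16[gf16_mul(gf16_mul(hi, hi), L)
                              ^ gf16_mul(hi, lo) ^ gf16_mul(lo, lo)];
            uint8_t ihi = gf16_mul(hi, t), ilo = gf16_mul((uint8_t)(hi ^ lo), t);
            /* verify g * g^-1 == 1: (a*z+b)(c*z+d) = (ad^bc^ac)z + (bd^ac*L) */
            uint8_t ac  = gf16_mul(hi, ihi);
            uint8_t rhi = (uint8_t)(gf16_mul(hi, ilo) ^ gf16_mul(lo, ihi) ^ ac);
            uint8_t rlo = (uint8_t)(gf16_mul(lo, ilo) ^ gf16_mul(ac, L));
            if (rhi != 0 || rlo != 1) { printf("FAIL %02x\n", g); return 1; }
        }
        printf("tower-field inversion verified for all 255 nonzero bytes\n");
        return 0;
    }
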
202 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
204 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
205 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
206 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
207 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
219 ld1 {v7.16b}, [x0]
222 st1 {v0.16b}, [x1]
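
These two lines are the entire I/O of the one-block entry point: the block is loaded from x0 and the result stored to x1, with the expanded key passed in the third argument register. In OpenSSL/BoringSSL these routines are internal and normally reached through EVP/AES_encrypt when the VPAES path is selected; a hedged sketch of calling them directly, assuming the in-tree prototypes:

    #include <openssl/aes.h>

    /* Hypothetical direct use of the internal entry points. */
    extern int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
                                      AES_KEY *key);
    extern void vpaes_encrypt(const unsigned char *in, unsigned char *out,
                              const AES_KEY *key);

    void encrypt_one_block(const unsigned char key_bytes[16],
                           const unsigned char in[16], unsigned char out[16])
    {
        AES_KEY key;
        vpaes_set_encrypt_key(key_bytes, 128, &key);
        vpaes_encrypt(in, out, &key);   /* x0/x1 = in/out above */
    }

The decryption entry point (the ld1/st1 pair at lines 456/459 below) mirrors this with the decryption schedule.
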
237 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
238 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
239 and v9.16b, v15.16b, v17.16b
240 ushr v8.16b, v15.16b, #4
241 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
242 tbl v9.16b, {v20.16b}, v9.16b
244 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
245 tbl v10.16b, {v21.16b}, v8.16b
246 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
247 eor v8.16b, v9.16b, v16.16b
248 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
249 eor v8.16b, v8.16b, v10.16b
250 b Lenc_2x_entry
256 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
257 tbl v12.16b, {v25.16b}, v10.16b
259 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
260 tbl v8.16b, {v24.16b}, v11.16b
261 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
262 eor v12.16b, v12.16b, v16.16b
263 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
264 tbl v13.16b, {v27.16b}, v10.16b
265 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
266 eor v8.16b, v8.16b, v12.16b
267 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
268 tbl v10.16b, {v26.16b}, v11.16b
270 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
271 tbl v11.16b, {v8.16b}, v1.16b
272 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
273 eor v10.16b, v10.16b, v13.16b
274 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
275 tbl v8.16b, {v8.16b}, v4.16b
276 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
277 eor v11.16b, v11.16b, v10.16b
278 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
279 tbl v12.16b, {v11.16b}, v1.16b
280 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
281 eor v8.16b, v8.16b, v11.16b
283 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
284 eor v8.16b, v8.16b, v12.16b
289 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
290 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
291 and v9.16b, v8.16b, v17.16b
292 ushr v8.16b, v8.16b, #4
293 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
294 tbl v13.16b, {v19.16b}, v9.16b
295 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
296 eor v9.16b, v9.16b, v8.16b
297 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
298 tbl v11.16b, {v18.16b}, v8.16b
299 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
300 tbl v12.16b, {v18.16b}, v9.16b
301 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
302 eor v11.16b, v11.16b, v13.16b
303 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
304 eor v12.16b, v12.16b, v13.16b
305 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
306 tbl v10.16b, {v18.16b}, v11.16b
307 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
308 tbl v11.16b, {v18.16b}, v12.16b
309 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
310 eor v10.16b, v10.16b, v9.16b
311 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
312 eor v11.16b, v11.16b, v8.16b
320 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
321 tbl v12.16b, {v22.16b}, v10.16b
323 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
324 tbl v8.16b, {v23.16b}, v11.16b
325 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
326 eor v12.16b, v12.16b, v16.16b
327 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
328 eor v8.16b, v8.16b, v12.16b
329 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
330 tbl v1.16b, {v8.16b}, v1.16b
339 movi v17.16b, #0x0f
371 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
372 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
373 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
376 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
377 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
378 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
379 b Ldec_entry
388 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
389 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
390 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
392 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
395 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
396 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
397 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
398 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
400 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
403 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
404 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
405 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
406 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
408 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
411 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
412 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
413 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
414 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
415 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
416 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
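
The decryption loop fuses InvSubBytes with InvMixColumns: the sb9/sbd/sbb/sbe table pairs are the inverse S-box pre-multiplied by 9, 13 (0x0D), 11 (0x0B) and 14 (0x0E), accumulated into "ch" with a tbl rotation (MC) between steps, and the ext by #12 at line 415 rotates the MixColumns permutation for the next round. Per output byte, InvMixColumns is out_i = 14*a_i + 11*a_{i+1} + 13*a_{i+2} + 9*a_{i+3}; a scalar sketch:

    #include <stdint.h>

    static uint8_t xtime(uint8_t a)
    {
        return (uint8_t)((a << 1) ^ ((a >> 7) ? 0x1b : 0));
    }

    /* GF(2^8) multiply by repeated doubling, enough for 9, 11, 13, 14. */
    static uint8_t gmul(uint8_t a, uint8_t m)
    {
        uint8_t r = 0;
        while (m) {
            if (m & 1) r ^= a;
            a = xtime(a);
            m >>= 1;
        }
        return r;
    }

    /* One InvMixColumns column; the sbe/sbb/sbd/sb9 tables above are these
     * four multiplies, each fused with the inverse S-box and basis change. */
    static void inv_mix_column(uint8_t out[4], const uint8_t a[4])
    {
        for (int i = 0; i < 4; i++)
            out[i] = (uint8_t)(gmul(a[i], 14) ^ gmul(a[(i + 1) & 3], 11)
                             ^ gmul(a[(i + 2) & 3], 13) ^ gmul(a[(i + 3) & 3], 9));
    }
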
421 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
422 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
423 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
424 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
425 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
426 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
427 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
428 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
429 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
430 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
431 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
432 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
438 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
441 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
442 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
443 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
444 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
456 ld1 {v7.16b}, [x0]
459 st1 {v0.16b}, [x1]
483 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
484 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
485 and v9.16b, v15.16b, v17.16b
486 ushr v8.16b, v15.16b, #4
487 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
488 tbl v10.16b, {v20.16b}, v9.16b
491 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
492 tbl v8.16b, {v21.16b}, v8.16b
493 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
494 eor v10.16b, v10.16b, v16.16b
495 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
496 eor v8.16b, v8.16b, v10.16b
497 b Ldec_2x_entry
506 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
507 tbl v12.16b, {v24.16b}, v10.16b
508 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
509 tbl v9.16b, {v25.16b}, v11.16b
510 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
511 eor v8.16b, v12.16b, v16.16b
513 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
514 eor v8.16b, v8.16b, v9.16b
517 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
518 tbl v12.16b, {v26.16b}, v10.16b
519 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
520 tbl v8.16b, {v8.16b}, v5.16b
521 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
522 tbl v9.16b, {v27.16b}, v11.16b
523 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
524 eor v8.16b, v8.16b, v12.16b
526 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
527 eor v8.16b, v8.16b, v9.16b
530 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
531 tbl v12.16b, {v28.16b}, v10.16b
532 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
533 tbl v8.16b, {v8.16b}, v5.16b
534 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
535 tbl v9.16b, {v29.16b}, v11.16b
536 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
537 eor v8.16b, v8.16b, v12.16b
539 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
540 eor v8.16b, v8.16b, v9.16b
543 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
544 tbl v12.16b, {v30.16b}, v10.16b
545 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
546 tbl v8.16b, {v8.16b}, v5.16b
547 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
548 tbl v9.16b, {v31.16b}, v11.16b
549 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
550 eor v8.16b, v8.16b, v12.16b
551 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
552 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
553 eor v8.16b, v8.16b, v9.16b
558 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
559 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
560 and v9.16b, v8.16b, v17.16b
561 ushr v8.16b, v8.16b, #4
562 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
563 tbl v10.16b, {v19.16b}, v9.16b
564 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
565 eor v9.16b, v9.16b, v8.16b
566 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
567 tbl v11.16b, {v18.16b}, v8.16b
568 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
569 tbl v12.16b, {v18.16b}, v9.16b
570 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
571 eor v11.16b, v11.16b, v10.16b
572 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
573 eor v12.16b, v12.16b, v10.16b
574 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
575 tbl v10.16b, {v18.16b}, v11.16b
576 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
577 tbl v11.16b, {v18.16b}, v12.16b
578 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
579 eor v10.16b, v10.16b, v9.16b
580 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
581 eor v11.16b, v11.16b, v8.16b
587 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
588 tbl v12.16b, {v22.16b}, v10.16b
590 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
591 tbl v9.16b, {v23.16b}, v11.16b
593 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
594 eor v12.16b, v12.16b, v16.16b
595 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
596 eor v8.16b, v9.16b, v12.16b
597 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
598 tbl v1.16b, {v8.16b}, v2.16b
611 movi v16.16b, #0x5b // Lk_s63
614 movi v17.16b, #0x0f // Lk_s0F
636 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
639 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
641 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
651 b Lschedule_go
656 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
662 b.hi Lschedule_256
663 b.eq Lschedule_192
682 b Loop_schedule_128
702 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
704 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
705 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
712 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
720 b Loop_schedule_192
734 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
741 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
750 movi v4.16b, #0
751 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
752 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
754 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
756 b Loop_schedule_256
781 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
786 eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
791 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
792 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
793 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
794 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
795 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
796 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
797 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
798 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
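
The run of self-EORs above clears v0 through v7 so no key-schedule material survives in registers on return. The C-level equivalent has to defeat dead-store elimination; a common sketch of the idea (OPENSSL_cleanse serves this purpose in-tree), not a drop-in replacement:

    #include <stddef.h>
    #include <string.h>

    /* Best-effort secure wipe: reading memset through a volatile function
     * pointer keeps the compiler from eliding it as a dead store. */
    static void *(*const volatile memset_v)(void *, int, size_t) = memset;

    static void wipe(void *p, size_t n)
    {
        memset_v(p, 0, n);
    }
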
809 ## %xmm7: high side, b a x y
814 ## %xmm6: b+c+d b+c 0 0
815 ## %xmm0: b+c+d b+c b a
820 movi v1.16b, #0
824 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
825 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
826 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
827 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
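
_vpaes_schedule_192_smear and the vpslldq pair further down (lines 868/870) both build running XORs of the schedule words by shift-and-XOR: two shifted self-XORs turn (w0, w1, w2, w3) into the prefix sums (w0, w0+w1, w0+w1+w2, w0+w1+w2+w3), which is where the b+c and b+c+d terms in the comments come from. The scalar model:

    #include <stdint.h>

    /* t ^= t << 32; t ^= t << 64 over a 4-word vector, the scalar model of
     * the vpslldq $4 / vpslldq $8 + vpxor pairs (emulated with ext above). */
    static void smear(uint32_t w[4])
    {
        w[3] ^= w[2]; w[2] ^= w[1]; w[1] ^= w[0];   /* t ^= t << 32 */
        w[3] ^= w[1]; w[2] ^= w[0];                 /* t ^= t << 64 */
        /* now w[i] = w[0] ^ ... ^ w[i] */
    }
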
854 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
855 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
856 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
857 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
861 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
868 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
869 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
870 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
873 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
874 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
875 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
876 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
877 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
878 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
879 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
880 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
881 eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
882 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
883 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
884 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
885 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
886 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
887 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
888 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
889 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
892 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
893 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
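
Taken together, _vpaes_schedule_round is one classic AES key-expansion step: rotate the last word by a byte (the ext by #1 at line 861), shift the round constant out of v8 (lines 854 through 857), SubBytes through the nibble tables, then XOR onto the smeared prefix in v7. A scalar equivalent, with the S-box built on the fly so the sketch is self-contained (round-key words held little-endian, an assumption of this sketch):

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t sbox[256];

    /* Build the AES S-box at runtime: GF(2^8) inverses via log/exp tables
     * over generator 3, then the FIPS-197 affine map. */
    static void init_sbox(void)
    {
        uint8_t exp3[255], log3[256] = {0};
        uint8_t x = 1;
        for (int i = 0; i < 255; i++) {
            exp3[i] = x;
            log3[x] = (uint8_t)i;
            x = (uint8_t)(x ^ (x << 1) ^ ((x & 0x80) ? 0x1b : 0));  /* x *= 3 */
        }
        for (int v = 0; v < 256; v++) {
            uint8_t b = v ? exp3[(255 - log3[v]) % 255] : 0;        /* 1/v */
            uint8_t r = 0x63;
            for (int i = 0; i < 5; i++) {                           /* affine */
                r ^= b;
                b = (uint8_t)((b << 1) | (b >> 7));
            }
            sbox[v] = r;
        }
    }

    /* One AES-128 expansion step: RotWord + SubWord + Rcon on the last
     * word, then the prefix-XOR smear. */
    static void expand_round(uint32_t w[4], uint8_t rcon)
    {
        uint32_t t = (w[3] >> 8) | (w[3] << 24);                    /* RotWord */
        t = (uint32_t)sbox[t & 0xff]
          | (uint32_t)sbox[(t >> 8) & 0xff] << 8
          | (uint32_t)sbox[(t >> 16) & 0xff] << 16
          | (uint32_t)sbox[t >> 24] << 24;                          /* SubWord */
        t ^= rcon;
        w[0] ^= t; w[1] ^= w[0]; w[2] ^= w[1]; w[3] ^= w[2];        /* smear */
    }

    int main(void)
    {
        /* First expansion step of the FIPS-197 example key. */
        uint32_t w[4] = { 0x16157e2b, 0xa6d2ae28, 0x8815f7ab, 0x3c4fcf09 };
        init_sbox();
        expand_round(w, 0x01);
        printf("%08x\n", w[0]);  /* expect 17fefaa0, i.e. bytes a0 fa fe 17 */
        return 0;
    }
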
909 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
910 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
912 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
914 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
915 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
945 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
950 eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
952 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
953 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
954 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
955 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
957 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
959 b Lschedule_mangle_both
964 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
965 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
968 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
970 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
971 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
972 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
975 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
976 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
978 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
979 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
980 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
983 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
984 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
986 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
987 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
990 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
991 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
993 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
995 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
996 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
1001 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1064 b.eq vpaes_cbc_decrypt
1072 ld1 {v0.16b}, [x4] // load ivec
1074 b Lcbc_enc_loop
1078 ld1 {v7.16b}, [x0],#16 // load input
1079 eor v7.16b, v7.16b, v0.16b // xor with ivec
1081 st1 {v0.16b}, [x1],#16 // save output
1083 b.hi Lcbc_enc_loop
1085 st1 {v0.16b}, [x4] // write ivec
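
Lcbc_enc_loop keeps the running IV in v0: each plaintext block is XORed in (line 1079), encrypted, stored, and the resulting ciphertext carries forward as the next IV, which is written back at line 1085. The same dataflow in C, with block_encrypt standing in for the cipher call:

    #include <stdint.h>
    #include <string.h>

    /* CBC encryption as the loop above implements it. */
    void cbc_encrypt(uint8_t *out, const uint8_t *in, size_t nblocks,
                     uint8_t iv[16],
                     void (*block_encrypt)(uint8_t dst[16], const uint8_t src[16]))
    {
        uint8_t buf[16];
        for (size_t i = 0; i < nblocks; i++) {
            for (int j = 0; j < 16; j++)
                buf[j] = in[16 * i + j] ^ iv[j];   /* eor v7, v7, v0 */
            block_encrypt(iv, buf);                /* ciphertext is the new IV */
            memcpy(out + 16 * i, iv, 16);          /* st1 {v0.16b}, [x1], #16 */
        }
    }
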
1104 ld1 {v6.16b}, [x4] // load ivec
1107 b.eq Lcbc_dec_loop2x
1109 ld1 {v7.16b}, [x0], #16 // load input
1111 eor v0.16b, v0.16b, v6.16b // xor with ivec
1112 orr v6.16b, v7.16b, v7.16b // next ivec value
1113 st1 {v0.16b}, [x1], #16
1115 b.ls Lcbc_dec_done
1119 ld1 {v14.16b,v15.16b}, [x0], #32
1121 eor v0.16b, v0.16b, v6.16b // xor with ivec
1122 eor v1.16b, v1.16b, v14.16b
1123 orr v6.16b, v15.16b, v15.16b
1124 st1 {v0.16b,v1.16b}, [x1], #32
1126 b.hi Lcbc_dec_loop2x
1129 st1 {v6.16b}, [x4]
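
The decryption side saves the raw ciphertext first (orr v6, v7, v7) because the block is about to be overwritten by the decryption, then XORs the result with the previous ciphertext; the 2x path at lines 1119 through 1126 does the same for block pairs. In C:

    #include <stdint.h>
    #include <string.h>

    /* CBC decryption, mirroring Lcbc_dec_loop.  Saving ct before writing
     * out keeps in-place (out == in) operation safe. */
    void cbc_decrypt(uint8_t *out, const uint8_t *in, size_t nblocks,
                     uint8_t iv[16],
                     void (*block_decrypt)(uint8_t dst[16], const uint8_t src[16]))
    {
        uint8_t ct[16], buf[16];
        for (size_t i = 0; i < nblocks; i++) {
            memcpy(ct, in + 16 * i, 16);          /* next IV (orr v6, v7, v7) */
            block_decrypt(buf, ct);
            for (int j = 0; j < 16; j++)
                out[16 * i + j] = buf[j] ^ iv[j]; /* eor v0, v0, v6 */
            memcpy(iv, ct, 16);
        }
    }
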
1159 ld1 {v7.16b}, [x4]
1164 b.eq Lctr32_prep_loop
1168 ld1 {v6.16b}, [x0], #16 // Load input ahead of time
1170 eor v0.16b, v0.16b, v6.16b // XOR input and result
1171 st1 {v0.16b}, [x1], #16
1177 b.ls Lctr32_done
1182 mov v15.16b, v7.16b
1183 mov v14.16b, v7.16b
1189 ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
1191 eor v0.16b, v0.16b, v6.16b // XOR input and result
1192 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
1193 st1 {v0.16b,v1.16b}, [x1], #32
1202 b.hi Lctr32_loop
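
The Lctr32 loop is stock CTR mode: encrypt the counter block kept in v7, XOR the keystream into the input loaded ahead of time, and bump the counter's last 32 bits as a big-endian integer (hence "ctr32"); the 2x path runs two counter values per iteration. In C:

    #include <stddef.h>
    #include <stdint.h>

    /* CTR mode as Lctr32_loop implements it. */
    void ctr32_encrypt(uint8_t *out, const uint8_t *in, size_t nblocks,
                       uint8_t ctr[16],
                       void (*block_encrypt)(uint8_t dst[16], const uint8_t src[16]))
    {
        uint8_t ks[16];
        for (size_t i = 0; i < nblocks; i++) {
            block_encrypt(ks, ctr);
            for (int j = 0; j < 16; j++)
                out[16 * i + j] = in[16 * i + j] ^ ks[j];  /* eor v0, v0, v6 */
            for (int j = 15; j >= 12; j--)                 /* big-endian ++ */
                if (++ctr[j]) break;
        }
    }
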