Lines matching full: 16

125 movi v17.16b, #0x0f
154 adrp x11, Lk_mc_forward+16
155 add x11, x11, :lo12:Lk_mc_forward+16
157 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
158 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
159 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
160 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
161 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
162 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
163 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
164 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
171 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
172 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
173 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
174 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
175 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
176 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
177 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
179 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
180 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
181 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
182 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
183 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
184 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
186 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
191 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
192 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
193 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
194 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
195 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
196 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
197 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
198 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
199 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
200 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
201 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
202 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
203 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
209 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
210 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
212 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
213 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
214 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
215 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
227 stp x29,x30,[sp,#-16]!
230 ld1 {v7.16b}, [x0]
233 st1 {v0.16b}, [x1]
235 ldp x29,x30,[sp],#16
247 adrp x11, Lk_mc_forward+16
248 add x11, x11, :lo12:Lk_mc_forward+16
250 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
251 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
252 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
253 and v9.16b, v15.16b, v17.16b
254 ushr v8.16b, v15.16b, #4
255 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
256 tbl v9.16b, {v20.16b}, v9.16b
257 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
258 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
259 tbl v10.16b, {v21.16b}, v8.16b
260 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
261 eor v8.16b, v9.16b, v16.16b
262 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
263 eor v8.16b, v8.16b, v10.16b
270 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
271 tbl v12.16b, {v25.16b}, v10.16b
272 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
273 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
274 tbl v8.16b, {v24.16b}, v11.16b
275 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
276 eor v12.16b, v12.16b, v16.16b
277 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
278 tbl v13.16b, {v27.16b}, v10.16b
279 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
280 eor v8.16b, v8.16b, v12.16b
281 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
282 tbl v10.16b, {v26.16b}, v11.16b
284 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
285 tbl v11.16b, {v8.16b}, v1.16b
286 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
287 eor v10.16b, v10.16b, v13.16b
288 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
289 tbl v8.16b, {v8.16b}, v4.16b
290 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
291 eor v11.16b, v11.16b, v10.16b
292 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
293 tbl v12.16b, {v11.16b},v1.16b
294 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
295 eor v8.16b, v8.16b, v11.16b
297 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
298 eor v8.16b, v8.16b, v12.16b
303 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
304 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
305 and v9.16b, v8.16b, v17.16b
306 ushr v8.16b, v8.16b, #4
307 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
308 tbl v13.16b, {v19.16b},v9.16b
309 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
310 eor v9.16b, v9.16b, v8.16b
311 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
312 tbl v11.16b, {v18.16b},v8.16b
313 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
314 tbl v12.16b, {v18.16b},v9.16b
315 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
316 eor v11.16b, v11.16b, v13.16b
317 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
318 eor v12.16b, v12.16b, v13.16b
319 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
320 tbl v10.16b, {v18.16b},v11.16b
321 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
322 tbl v11.16b, {v18.16b},v12.16b
323 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
324 eor v10.16b, v10.16b, v9.16b
325 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
326 eor v11.16b, v11.16b, v8.16b
327 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
333 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
334 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
335 tbl v12.16b, {v22.16b}, v10.16b
337 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
338 tbl v8.16b, {v23.16b}, v11.16b
339 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
340 eor v12.16b, v12.16b, v16.16b
341 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
342 eor v8.16b, v8.16b, v12.16b
343 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
344 tbl v1.16b, {v8.16b},v1.16b
355 movi v17.16b, #0x0f
388 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
389 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
390 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0
391 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
393 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
394 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
395 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
396 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
406 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
407 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
408 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
410 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
413 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
414 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
415 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
416 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
418 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
421 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
422 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
423 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
424 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
426 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
429 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
430 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
431 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
432 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
433 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
434 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
439 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
440 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
441 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
442 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
443 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
444 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
445 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
446 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
447 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
448 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
449 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
450 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
451 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
456 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
459 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
460 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
461 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
462 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
474 stp x29,x30,[sp,#-16]!
477 ld1 {v7.16b}, [x0]
480 st1 {v0.16b}, [x1]
482 ldp x29,x30,[sp],#16
506 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
507 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
508 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0
509 and v9.16b, v15.16b, v17.16b
510 ushr v8.16b, v15.16b, #4
511 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
512 tbl v10.16b, {v20.16b},v9.16b
514 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
515 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
516 tbl v8.16b, {v21.16b},v8.16b
517 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
518 eor v10.16b, v10.16b, v16.16b
519 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
520 eor v8.16b, v8.16b, v10.16b
530 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
531 tbl v12.16b, {v24.16b}, v10.16b
532 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
533 tbl v9.16b, {v25.16b}, v11.16b
534 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
535 eor v8.16b, v12.16b, v16.16b
537 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
538 eor v8.16b, v8.16b, v9.16b
541 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
542 tbl v12.16b, {v26.16b}, v10.16b
543 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
544 tbl v8.16b, {v8.16b},v5.16b
545 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
546 tbl v9.16b, {v27.16b}, v11.16b
547 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
548 eor v8.16b, v8.16b, v12.16b
550 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
551 eor v8.16b, v8.16b, v9.16b
554 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
555 tbl v12.16b, {v28.16b}, v10.16b
556 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
557 tbl v8.16b, {v8.16b},v5.16b
558 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
559 tbl v9.16b, {v29.16b}, v11.16b
560 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
561 eor v8.16b, v8.16b, v12.16b
563 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
564 eor v8.16b, v8.16b, v9.16b
567 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
568 tbl v12.16b, {v30.16b}, v10.16b
569 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
570 tbl v8.16b, {v8.16b},v5.16b
571 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
572 tbl v9.16b, {v31.16b}, v11.16b
573 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
574 eor v8.16b, v8.16b, v12.16b
575 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5
576 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
577 eor v8.16b, v8.16b, v9.16b
582 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
583 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
584 and v9.16b, v8.16b, v17.16b
585 ushr v8.16b, v8.16b, #4
586 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
587 tbl v10.16b, {v19.16b},v9.16b
588 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
589 eor v9.16b, v9.16b, v8.16b
590 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
591 tbl v11.16b, {v18.16b},v8.16b
592 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
593 tbl v12.16b, {v18.16b},v9.16b
594 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
595 eor v11.16b, v11.16b, v10.16b
596 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
597 eor v12.16b, v12.16b, v10.16b
598 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
599 tbl v10.16b, {v18.16b},v11.16b
600 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
601 tbl v11.16b, {v18.16b},v12.16b
602 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
603 eor v10.16b, v10.16b, v9.16b
604 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
605 eor v11.16b, v11.16b, v8.16b
606 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
611 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
612 tbl v12.16b, {v22.16b}, v10.16b
614 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
615 tbl v9.16b, {v23.16b}, v11.16b
617 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
618 eor v12.16b, v12.16b, v16.16b
619 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
620 eor v8.16b, v9.16b, v12.16b
621 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
622 tbl v1.16b, {v8.16b},v2.16b
637 movi v16.16b, #0x5b // Lk_s63
640 movi v17.16b, #0x0f // Lk_s0F
660 stp x29, x30, [sp,#-16]!
665 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
668 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
670 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
685 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
731 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
733 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
734 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
741 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0
763 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
770 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
779 movi v4.16b, #0
780 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
781 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
783 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
810 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
814 sub x2, x2, #16 // add $-16, %rdx
815 eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0
820 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
821 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
822 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
823 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
824 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
825 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
826 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
827 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
828 ldp x29, x30, [sp],#16
852 movi v1.16b, #0
856 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
857 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
858 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
859 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
888 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
889 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1
890 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8
891 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
895 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0
902 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1
903 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
904 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4
907 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
908 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i
909 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
910 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
911 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
912 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
913 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
914 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
915 eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7
916 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
917 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
918 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
919 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
920 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
921 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
922 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
923 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
926 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
927 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
945 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
946 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0
948 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
949 // vmovdqa 16(%r11), %xmm1 # hi
950 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
951 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
983 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
988 eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4
989 add x2, x2, #16 // add $16, %rdx
990 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
991 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
992 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
993 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
995 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
1002 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi
1003 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
1006 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1008 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1009 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1010 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1013 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1014 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1016 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1017 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1018 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1021 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1022 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1024 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1025 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1028 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1029 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1031 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
1033 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1034 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
1036 sub x2, x2, #16 // add $-16, %rdx
1039 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1040 add x8, x8, #48 // add $-16, %r8
1054 stp x29,x30,[sp,#-16]!
1056 stp d8,d9,[sp,#-16]! // ABI spec says so
1067 ldp d8,d9,[sp],#16
1068 ldp x29,x30,[sp],#16
1081 stp x29,x30,[sp,#-16]!
1083 stp d8,d9,[sp,#-16]! // ABI spec says so
1089 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx
1098 ldp d8,d9,[sp],#16
1099 ldp x29,x30,[sp],#16
1115 stp x29,x30,[sp,#-16]!
1121 ld1 {v0.16b}, [x4] // load ivec
1127 ld1 {v7.16b}, [x0],#16 // load input
1128 eor v7.16b, v7.16b, v0.16b // xor with ivec
1130 st1 {v0.16b}, [x1],#16 // save output
1131 subs x17, x17, #16
1134 st1 {v0.16b}, [x4] // write ivec
1136 ldp x29,x30,[sp],#16
1149 stp x29,x30,[sp,#-16]!
1151 stp d8,d9,[sp,#-16]! // ABI spec says so
1152 stp d10,d11,[sp,#-16]!
1153 stp d12,d13,[sp,#-16]!
1154 stp d14,d15,[sp,#-16]!
1158 ld1 {v6.16b}, [x4] // load ivec
1160 tst x17, #16
1163 ld1 {v7.16b}, [x0], #16 // load input
1165 eor v0.16b, v0.16b, v6.16b // xor with ivec
1166 orr v6.16b, v7.16b, v7.16b // next ivec value
1167 st1 {v0.16b}, [x1], #16
1168 subs x17, x17, #16
1173 ld1 {v14.16b,v15.16b}, [x0], #32
1175 eor v0.16b, v0.16b, v6.16b // xor with ivec
1176 eor v1.16b, v1.16b, v14.16b
1177 orr v6.16b, v15.16b, v15.16b
1178 st1 {v0.16b,v1.16b}, [x1], #32
1183 st1 {v6.16b}, [x4]
1185 ldp d14,d15,[sp],#16
1186 ldp d12,d13,[sp],#16
1187 ldp d10,d11,[sp],#16
1188 ldp d8,d9,[sp],#16
1189 ldp x29,x30,[sp],#16
1201 stp x29,x30,[sp,#-16]!
1203 stp d8,d9,[sp,#-16]! // ABI spec says so
1204 stp d10,d11,[sp,#-16]!
1205 stp d12,d13,[sp,#-16]!
1206 stp d14,d15,[sp,#-16]!
1217 ld1 {v7.16b}, [x4]
1226 ld1 {v6.16b}, [x0], #16 // Load input ahead of time
1228 eor v0.16b, v0.16b, v6.16b // XOR input and result
1229 st1 {v0.16b}, [x1], #16
1240 mov v15.16b, v7.16b
1241 mov v14.16b, v7.16b
1247 ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time
1249 eor v0.16b, v0.16b, v6.16b // XOR input and result
1250 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2)
1251 st1 {v0.16b,v1.16b}, [x1], #32
1263 ldp d14,d15,[sp],#16
1264 ldp d12,d13,[sp],#16
1265 ldp d10,d11,[sp],#16
1266 ldp d8,d9,[sp],#16
1267 ldp x29,x30,[sp],#16