/external/libaom/libaom/aom_dsp/x86/ |
D | highbd_subpel_variance_impl_sse2.asm |
    313  ; slightly faster because of pmullw latency. It would also cut our rodata
    315  pmullw m1, filter_y_a
    316  pmullw m5, filter_y_b
    318  pmullw m0, filter_y_a
    319  pmullw m4, filter_y_b
    343  pmullw m1, filter_y_a
    344  pmullw m5, filter_y_b
    346  pmullw m0, filter_y_a
    347  pmullw m4, filter_y_b
    546  pmullw m1, filter_y_a
    [all …]
|
D | subpel_variance_sse2.asm |
    408  ; slightly faster because of pmullw latency. It would also cut our rodata
    410  pmullw m2, filter_y_a
    411  pmullw m3, filter_y_b
    413  pmullw m0, filter_y_a
    414  pmullw m4, filter_y_b
    452  pmullw m0, filter_y_a
    453  pmullw m1, m2, filter_y_b
    456  pmullw m2, filter_y_a
    457  pmullw m4, filter_y_b
    886  pmullw m2, filter_x_a
    [all …]
|
D | aom_subpixel_8t_sse2.asm |
    64  pmullw xmm0, k0k1 ;multiply the filter factors
    65  pmullw xmm6, k6k7
    66  pmullw xmm2, k2k3
    67  pmullw xmm5, k5k4
    155  pmullw xmm0, k0
    156  pmullw xmm1, k1
    157  pmullw xmm6, k6
    158  pmullw xmm7, k7
    159  pmullw xmm2, k2
    160  pmullw xmm5, k5
    [all …]
|
D | aom_subpixel_bilinear_sse2.asm |
    42  pmullw xmm0, xmm4 ;multiply the filter factors
    90  pmullw xmm0, xmm6
    91  pmullw xmm1, xmm7
    113  pmullw xmm0, xmm6
    114  pmullw xmm1, xmm7
    115  pmullw xmm2, xmm6
    116  pmullw xmm3, xmm7
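The subpixel and bilinear entries above all follow the same pattern: two neighbouring rows (or columns) are each multiplied by a filter factor with pmullw, then summed and rounded. A minimal C intrinsics sketch of that step, assuming 8-bit pixels already widened to 16 bits and a pair of weights that sum to 128; the function name, weights, and rounding constant here are illustrative, not taken from the libaom/libvpx sources:

    #include <emmintrin.h>  /* SSE2 */

    /* One bilinear step on 8 pixels: out = (a*w0 + b*w1 + round) >> 7,
       assuming w0 + w1 == 128 so the 16-bit products cannot overflow. */
    static __m128i bilinear8(__m128i a, __m128i b, int w0, int w1)
    {
        const __m128i f0  = _mm_set1_epi16((short)w0);
        const __m128i f1  = _mm_set1_epi16((short)w1);
        const __m128i rnd = _mm_set1_epi16(64);           /* 1 << (7 - 1)  */
        __m128i p0 = _mm_mullo_epi16(a, f0);              /* pmullw        */
        __m128i p1 = _mm_mullo_epi16(b, f1);              /* pmullw        */
        __m128i s  = _mm_add_epi16(_mm_add_epi16(p0, p1), rnd);
        return _mm_srli_epi16(s, 7);                      /* >> 7          */
    }

_mm_mullo_epi16 is the intrinsic that compiles to pmullw, which is why the instruction shows up once per filter tap in these listings.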
|
D | quantize_ssse3_x86_64.asm |
    79  pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
    89  pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
    125  pmullw m8, m3 ; dqc[i] = qc[i] * q
    127  pmullw m13, m3 ; dqc[i] = qc[i] * q
    187  pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
    194  pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
    231  pmullw m14, m3 ; dqc[i] = qc[i] * q
    232  pmullw m13, m3 ; dqc[i] = qc[i] * q
|
D | quantize_avx_x86_64.asm |
    125  pmullw m8, m3 ; dqc[i] = qc[i] * q
    127  pmullw m13, m3 ; dqc[i] = qc[i] * q
    256  pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
    266  pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
    296  pmullw m8, m3 ; dqc[i] = qc[i] * q
    298  pmullw m13, m3 ; dqc[i] = qc[i] * q
    377  pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
    384  pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
    414  pmullw m14, m3 ; dqc[i] = qc[i] * q
    415  pmullw m13, m3 ; dqc[i] = qc[i] * q
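The "dqc[i] = qc[i] * q" comments in the two quantizer files above describe the dequantization step: eight quantized coefficients are multiplied element-wise by the quantizer step with a single pmullw. A hedged C sketch of just that step (function and parameter names are mine; the real code also keeps the upper bits of other products and has a separate high-bit-depth path):

    #include <emmintrin.h>
    #include <stdint.h>

    /* dqc[i] = qc[i] * q for 8 coefficients at a time, assuming the
       products stay within 16 bits. */
    static void dequant8(const int16_t *qc, int16_t q, int16_t *dqc)
    {
        __m128i coeffs = _mm_loadu_si128((const __m128i *)qc);
        __m128i step   = _mm_set1_epi16(q);
        __m128i dq     = _mm_mullo_epi16(coeffs, step);   /* pmullw */
        _mm_storeu_si128((__m128i *)dqc, dq);
    }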
|
/external/libvpx/libvpx/vpx_dsp/x86/ |
D | highbd_subpel_variance_impl_sse2.asm |
    310  ; slightly faster because of pmullw latency. It would also cut our rodata
    312  pmullw m1, filter_y_a
    313  pmullw m5, filter_y_b
    315  pmullw m0, filter_y_a
    316  pmullw m4, filter_y_b
    340  pmullw m1, filter_y_a
    341  pmullw m5, filter_y_b
    343  pmullw m0, filter_y_a
    344  pmullw m4, filter_y_b
    543  pmullw m1, filter_y_a
    [all …]
|
D | subpel_variance_sse2.asm |
    405  ; slightly faster because of pmullw latency. It would also cut our rodata
    407  pmullw m2, filter_y_a
    408  pmullw m3, filter_y_b
    410  pmullw m0, filter_y_a
    411  pmullw m4, filter_y_b
    449  pmullw m0, filter_y_a
    450  pmullw m1, m2, filter_y_b
    453  pmullw m2, filter_y_a
    454  pmullw m4, filter_y_b
    883  pmullw m2, filter_x_a
    [all …]
|
D | vpx_subpixel_bilinear_sse2.asm |
    39  pmullw xmm0, xmm4 ;multiply the filter factors
    87  pmullw xmm0, xmm6
    88  pmullw xmm1, xmm7
    110  pmullw xmm0, xmm6
    111  pmullw xmm1, xmm7
    112  pmullw xmm2, xmm6
    113  pmullw xmm3, xmm7
|
D | vpx_subpixel_8t_sse2.asm |
    61  pmullw xmm0, k0k1 ;multiply the filter factors
    62  pmullw xmm6, k6k7
    63  pmullw xmm2, k2k3
    64  pmullw xmm5, k5k4
    152  pmullw xmm0, k0
    153  pmullw xmm1, k1
    154  pmullw xmm6, k6
    155  pmullw xmm7, k7
    156  pmullw xmm2, k2
    157  pmullw xmm5, k5
    [all …]
|
/external/libvpx/libvpx/vp8/common/x86/ |
D | subpixel_sse2.asm |
    77  pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
    81  pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
    87  pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
    92  pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
    98  pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
    101  pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
    206  pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
    210  pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
    216  pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
    221  pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
    [all …]
|
D | subpixel_mmx.asm |
    58  pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
    62  pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
    68  pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
    74  pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
    80  pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
    84  pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
    157  pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
    161  pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
    165  pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
    169  pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
    [all …]
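The two VP8 subpixel files above spell out the per-tap pattern in their comments: each shifted copy of the source row is multiplied by one kernel entry (Tap 1 .. Tap 6, "kernel N modifiers") and the products are summed and rounded. A simplified C sketch of that pattern, assuming pixels already widened to 16 bits; the real code orders its additions to avoid 16-bit overflow and finishes with a saturating pack, both of which are omitted here:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Six-tap filter on 8 pixels.  src[k] holds the row shifted by k-2
       (i.e. x[-2] .. x[3]); taps[] is the 6-entry kernel. */
    static __m128i sixtap8(const __m128i src[6], const int16_t taps[6])
    {
        __m128i acc = _mm_set1_epi16(64);                    /* rounding */
        for (int k = 0; k < 6; ++k) {
            __m128i prod = _mm_mullo_epi16(src[k],           /* pmullw   */
                                           _mm_set1_epi16(taps[k]));
            acc = _mm_add_epi16(acc, prod);
        }
        return _mm_srai_epi16(acc, 7);                       /* >> 7     */
    }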
|
D | dequantize_mmx.asm |
    31  pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers.
    35  pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers.
    39  pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers.
    43  pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers.
    73  pmullw mm0, [rdx]
    76  pmullw mm1, [rdx +8]
    79  pmullw mm2, [rdx+16]
    82  pmullw mm3, [rdx+24]
|
D | mfqe_sse2.asm |
    59  pmullw xmm2, xmm0
    60  pmullw xmm3, xmm0
    66  pmullw xmm4, xmm1
    67  pmullw xmm5, xmm1
    134  pmullw xmm2, xmm0
    138  pmullw xmm3, xmm1
|
D | idctllm_sse2.asm |
    41  pmullw xmm4, xmm5
    142  pmullw xmm0, [rdx]
    143  pmullw xmm2, [rdx+16]
    144  pmullw xmm1, [rdx]
    145  pmullw xmm3, [rdx+16]
    474  pmullw xmm0, [rdx]
    475  pmullw xmm2, [rdx+16]
    476  pmullw xmm1, [rdx]
    477  pmullw xmm3, [rdx+16]
|
/external/llvm/test/CodeGen/X86/ |
D | pmul.ll |
    16  ; SSE2-NEXT: pmullw %xmm1, %xmm2
    21  ; SSE2-NEXT: pmullw %xmm1, %xmm0
    30  ; SSE41-NEXT: pmullw %xmm2, %xmm1
    35  ; SSE41-NEXT: pmullw %xmm2, %xmm0
    79  ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
    153  ; SSE2-NEXT: pmullw %xmm2, %xmm3
    160  ; SSE2-NEXT: pmullw %xmm1, %xmm0
    169  ; SSE41-NEXT: pmullw %xmm3, %xmm2
    176  ; SSE41-NEXT: pmullw %xmm1, %xmm0
    220  ; SSE-NEXT: pmullw %xmm1, %xmm0
    [all …]
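pmul.ll checks how vector multiplies are lowered. With SSE2 a v16i8 multiply comes out as a pair of pmullw instructions because there is no byte-wide multiply: the bytes are unpacked to 16 bits, multiplied, and the low byte of each product is taken back. A hedged C intrinsics version of that idiom (the exact unpack/mask/pack sequence the backend emits may differ):

    #include <emmintrin.h>

    /* 16 x i8 multiply via two pmullw.  The low byte of each 16-bit
       product is the wrap-around 8-bit product, signed or unsigned. */
    static __m128i mul_epi8(__m128i a, __m128i b)
    {
        const __m128i zero = _mm_setzero_si128();
        const __m128i mask = _mm_set1_epi16(0x00FF);
        __m128i lo = _mm_mullo_epi16(_mm_unpacklo_epi8(a, zero),
                                     _mm_unpacklo_epi8(b, zero));  /* pmullw */
        __m128i hi = _mm_mullo_epi16(_mm_unpackhi_epi8(a, zero),
                                     _mm_unpackhi_epi8(b, zero));  /* pmullw */
        lo = _mm_and_si128(lo, mask);
        hi = _mm_and_si128(hi, mask);
        return _mm_packus_epi16(lo, hi);                           /* packuswb */
    }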
|
D | vec_shift6.ll |
    8  ; Check that we produce a SSE2 packed integer multiply (pmullw) instead.
    13  ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
    32  ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
    91  ; into two pmullw instructions. With AVX2, the test case below would produce
    98  ; SSE-NEXT: pmullw %xmm2, %xmm0
    99  ; SSE-NEXT: pmullw %xmm2, %xmm1
    142  ; parts and then we convert each part into a pmullw.
    148  ; SSE-NEXT: pmullw %xmm4, %xmm0
    149  ; SSE-NEXT: pmullw %xmm4, %xmm1
    150  ; SSE-NEXT: pmullw %xmm4, %xmm2
    [all …]
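vec_shift6.ll verifies that a left shift of a v8i16 by per-lane constants is turned into a single pmullw against a vector of powers of two rather than being scalarized. The equivalence it relies on, written out in C with made-up shift amounts:

    #include <emmintrin.h>

    /* x << {1,2,3,4,1,2,3,4} per lane == x * {2,4,8,16,2,4,8,16},
       so the backend can emit one pmullw instead of eight shifts. */
    static __m128i shift_by_consts(__m128i x)
    {
        const __m128i pow2 = _mm_setr_epi16(2, 4, 8, 16, 2, 4, 8, 16);
        return _mm_mullo_epi16(x, pow2);                  /* pmullw */
    }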
|
/external/swiftshader/third_party/llvm-7.0/llvm/test/Transforms/LoopVectorize/X86/ |
D | mul_slm_16bit.ll |
    34  ; use pmullw\sext seq.
    38  ; use pmulhw\pmullw\pshuf
    44  ; use pmullw\zext
    50  ; use pmullw\sext
    55  ; use pmulhw\pmullw\pshuf
    60  ; use pmulhw\pmullw\pshuf
    65  ; use pmullw\zext
    103  ; use pmulhw\pmullw\pshuf seq.
    113  ; use pmulhw\pmullw\zext
    119  ; use pmulhw\pmullw\sext
    [all …]
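The pmulhw\pmullw\pshuf sequences this cost-model test refers to are the classic way to get full 32-bit products out of 16-bit lanes: pmullw supplies the low halves, pmulhw the high halves, and a shuffle/unpack interleaves them. A hedged C sketch of that sequence (the standard idiom, not the vectorizer's literal output):

    #include <emmintrin.h>

    /* Full signed 32-bit products of eight 16-bit lanes: pmullw gives
       the low halves, pmulhw the high halves, and the unpacks
       interleave them into two vectors of four i32 each. */
    static void mul_widen_epi16(__m128i a, __m128i b,
                                __m128i *lo32, __m128i *hi32)
    {
        __m128i lo = _mm_mullo_epi16(a, b);   /* pmullw */
        __m128i hi = _mm_mulhi_epi16(a, b);   /* pmulhw */
        *lo32 = _mm_unpacklo_epi16(lo, hi);   /* lanes 0..3 as i32 */
        *hi32 = _mm_unpackhi_epi16(lo, hi);   /* lanes 4..7 as i32 */
    }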
|
/external/swiftshader/third_party/llvm-7.0/llvm/test/CodeGen/X86/ |
D | pmul.ll |
    14  ; SSE2-NEXT: pmullw %xmm2, %xmm1
    18  ; SSE2-NEXT: pmullw %xmm2, %xmm0
    28  ; SSE41-NEXT: pmullw %xmm2, %xmm0
    31  ; SSE41-NEXT: pmullw %xmm2, %xmm1
    74  ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
    146  ; SSE2-NEXT: pmullw %xmm2, %xmm3
    151  ; SSE2-NEXT: pmullw %xmm1, %xmm0
    162  ; SSE41-NEXT: pmullw %xmm1, %xmm0
    165  ; SSE41-NEXT: pmullw %xmm3, %xmm2
    211  ; SSE-NEXT: pmullw %xmm1, %xmm0
    [all …]
|
D | vec_shift6.ll |
    9  ; Check that we produce a SSE2 packed integer multiply (pmullw) instead.
    14  ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
    28  ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
    93  ; into two pmullw instructions. With AVX2, the test case below would produce
    100  ; SSE-NEXT: pmullw %xmm2, %xmm0
    101  ; SSE-NEXT: pmullw %xmm2, %xmm1
    153  ; parts and then we convert each part into a pmullw.
    159  ; SSE-NEXT: pmullw %xmm4, %xmm0
    160  ; SSE-NEXT: pmullw %xmm4, %xmm1
    161  ; SSE-NEXT: pmullw %xmm4, %xmm2
    [all …]
|
D | vector-reduce-mul.ll |
    1133  ; SSE-NEXT: pmullw %xmm0, %xmm1
    1135  ; SSE-NEXT: pmullw %xmm1, %xmm0
    1138  ; SSE-NEXT: pmullw %xmm0, %xmm1
    1173  ; SSE-NEXT: pmullw %xmm1, %xmm0
    1175  ; SSE-NEXT: pmullw %xmm0, %xmm1
    1177  ; SSE-NEXT: pmullw %xmm1, %xmm0
    1180  ; SSE-NEXT: pmullw %xmm0, %xmm1
    1236  ; SSE-NEXT: pmullw %xmm3, %xmm1
    1237  ; SSE-NEXT: pmullw %xmm2, %xmm0
    1238  ; SSE-NEXT: pmullw %xmm1, %xmm0
    [all …]
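vector-reduce-mul.ll exercises horizontal product reductions, which is why pmullw appears in runs: the vector is folded in half with a shuffle and multiplied, repeatedly, until one lane holds the product of all lanes. A hedged C sketch of the v8i16 case; the shuffles below are one valid choice, not necessarily the ones the backend picks:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Multiply all eight 16-bit lanes together (mod 2^16) by halving
       the vector: 8 -> 4 -> 2 -> 1 lanes, one pmullw per step. */
    static int16_t reduce_mul_epi16(__m128i v)
    {
        __m128i t = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)); /* swap 64-bit halves */
        v = _mm_mullo_epi16(v, t);                                 /* pmullw */
        t = _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1));         /* swap 32-bit pairs  */
        v = _mm_mullo_epi16(v, t);
        t = _mm_shufflelo_epi16(v, _MM_SHUFFLE(2, 3, 0, 1));       /* swap lanes 0 and 1 */
        v = _mm_mullo_epi16(v, t);
        return (int16_t)_mm_extract_epi16(v, 0);
    }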
|
D | vector-mul.ll |
    156  ; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
    161  ; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
    184  ; X86-NEXT: pmullw %xmm2, %xmm0
    187  ; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm1
    199  ; X64-NEXT: pmullw %xmm2, %xmm0
    202  ; X64-NEXT: pmullw {{.*}}(%rip), %xmm1
    296  ; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
    301  ; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
    318  ; X86-NEXT: pmullw %xmm2, %xmm0
    321  ; X86-NEXT: pmullw %xmm2, %xmm1
    [all …]
|
/external/libvpx/libvpx/vp9/common/x86/ |
D | vp9_mfqe_sse2.asm |
    60  pmullw xmm2, xmm0
    61  pmullw xmm3, xmm0
    67  pmullw xmm4, xmm1
    68  pmullw xmm5, xmm1
    135  pmullw xmm2, xmm0
    139  pmullw xmm3, xmm1
|
/external/mesa3d/src/mesa/x86/ |
D | read_rgba_span_x86.S |
    553  pmullw %mm6, %mm0
    554  pmullw %mm6, %mm2
    588  pmullw %mm6, %mm0
    589  pmullw %mm6, %mm2
    626  pmullw %mm6, %mm0
    627  pmullw %mm6, %mm2
    657  pmullw %mm6, %mm0
|
/external/libvpx/libvpx/vp9/encoder/x86/ |
D | vp9_quantize_ssse3_x86_64.asm |
    76  pmullw m8, m3 ; r4[i] = r3[i] * q
    78  pmullw m13, m3 ; r4[i] = r3[i] * q
    130  pmullw m14, m3 ; r4[i] = r3[i] * q
    131  pmullw m13, m3 ; r4[i] = r3[i] * q
|