
Searched refs:pmullw (Results 1 – 25 of 124) sorted by relevance
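Note: every hit below is the same basic operation. pmullw multiplies eight packed 16-bit lanes and keeps only the low 16 bits of each 32-bit product (pmulhw/pmulhuw supply the high halves). As a minimal illustration of those semantics, and not code from any of the listed files, the SSE2 intrinsic _mm_mullo_epi16 compiles to pmullw:

    #include <emmintrin.h>  /* SSE2: _mm_mullo_epi16 emits pmullw */
    #include <stdio.h>

    int main(void) {
        /* Eight signed 16-bit lanes per operand. */
        __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 300);
        __m128i b = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 300);

        /* Per-lane 16x16 multiply; only the low 16 bits of each product
           are kept, so 300 * 300 = 90000 wraps to 90000 - 65536 = 24464. */
        __m128i lo = _mm_mullo_epi16(a, b);

        short out[8];
        _mm_storeu_si128((__m128i *)out, lo);
        for (int i = 0; i < 8; i++) printf("%d ", out[i]);
        printf("\n");
        return 0;
    }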


/external/libaom/libaom/aom_dsp/x86/
highbd_subpel_variance_impl_sse2.asm
313 ; slightly faster because of pmullw latency. It would also cut our rodata
315 pmullw m1, filter_y_a
316 pmullw m5, filter_y_b
318 pmullw m0, filter_y_a
319 pmullw m4, filter_y_b
343 pmullw m1, filter_y_a
344 pmullw m5, filter_y_b
346 pmullw m0, filter_y_a
347 pmullw m4, filter_y_b
546 pmullw m1, filter_y_a
[all …]
subpel_variance_sse2.asm
408 ; slightly faster because of pmullw latency. It would also cut our rodata
410 pmullw m2, filter_y_a
411 pmullw m3, filter_y_b
413 pmullw m0, filter_y_a
414 pmullw m4, filter_y_b
452 pmullw m0, filter_y_a
453 pmullw m1, m2, filter_y_b
456 pmullw m2, filter_y_a
457 pmullw m4, filter_y_b
886 pmullw m2, filter_x_a
[all …]
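The filter_y_a/filter_y_b pairs in the two entries above are a two-tap bilinear interpolation: one row is multiplied by the first tap, the adjacent row by the second, and the products are summed, rounded, and shifted back to pixel range. A minimal sketch of that pattern, assuming the usual libaom/libvpx bilinear precision (taps summing to 128, 7-bit shift); the function name is illustrative, not from these files:

    #include <emmintrin.h>

    /* Two-tap vertical bilinear filter over one 8-pixel row of 16-bit
       samples.  fa + fb is assumed to sum to 128 (7-bit filter); products
       of 8-bit pixels and 7-bit taps fit in signed 16-bit lanes. */
    static __m128i bilinear_row(__m128i row0, __m128i row1, int fa, int fb) {
        const __m128i va  = _mm_set1_epi16((short)fa);
        const __m128i vb  = _mm_set1_epi16((short)fb);
        const __m128i rnd = _mm_set1_epi16(64);

        __m128i p0  = _mm_mullo_epi16(row0, va);   /* pmullw m*, filter_y_a */
        __m128i p1  = _mm_mullo_epi16(row1, vb);   /* pmullw m*, filter_y_b */
        __m128i sum = _mm_add_epi16(_mm_add_epi16(p0, p1), rnd);
        return _mm_srli_epi16(sum, 7);             /* back to pixel range */
    }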
aom_subpixel_8t_sse2.asm
64 pmullw xmm0, k0k1 ;multiply the filter factors
65 pmullw xmm6, k6k7
66 pmullw xmm2, k2k3
67 pmullw xmm5, k5k4
155 pmullw xmm0, k0
156 pmullw xmm1, k1
157 pmullw xmm6, k6
158 pmullw xmm7, k7
159 pmullw xmm2, k2
160 pmullw xmm5, k5
[all …]
aom_subpixel_bilinear_sse2.asm
42 pmullw xmm0, xmm4 ;multiply the filter factors
90 pmullw xmm0, xmm6
91 pmullw xmm1, xmm7
113 pmullw xmm0, xmm6
114 pmullw xmm1, xmm7
115 pmullw xmm2, xmm6
116 pmullw xmm3, xmm7
quantize_ssse3_x86_64.asm
79 pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
89 pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
125 pmullw m8, m3 ; dqc[i] = qc[i] * q
127 pmullw m13, m3 ; dqc[i] = qc[i] * q
187 pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
194 pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
231 pmullw m14, m3 ; dqc[i] = qc[i] * q
232 pmullw m13, m3 ; dqc[i] = qc[i] * q
quantize_avx_x86_64.asm
125 pmullw m8, m3 ; dqc[i] = qc[i] * q
127 pmullw m13, m3 ; dqc[i] = qc[i] * q
256 pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
266 pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
296 pmullw m8, m3 ; dqc[i] = qc[i] * q
298 pmullw m13, m3 ; dqc[i] = qc[i] * q
377 pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
384 pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
414 pmullw m14, m3 ; dqc[i] = qc[i] * q
415 pmullw m13, m3 ; dqc[i] = qc[i] * q
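The quantizer hits above use pmullw for the dequant step dqc[i] = qc[i] * q, keeping the low 16 bits of each product. A minimal sketch of that step with SSE2 intrinsics; the function and argument names are illustrative and not taken from the .asm files:

    #include <emmintrin.h>

    /* dqc[i] = qc[i] * q for eight coefficients at a time; mirrors the
       "pmullw m8, m3 ; dqc[i] = qc[i] * q" lines above.  Assumes the
       products stay within 16 bits, as the quantizer arranges for valid
       coefficient/step combinations. */
    static void dequant8(const short *qc, const short *dequant, short *dqc) {
        __m128i coeffs = _mm_loadu_si128((const __m128i *)qc);
        __m128i q      = _mm_loadu_si128((const __m128i *)dequant);
        _mm_storeu_si128((__m128i *)dqc, _mm_mullo_epi16(coeffs, q));
    }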
/external/libvpx/libvpx/vpx_dsp/x86/
highbd_subpel_variance_impl_sse2.asm
310 ; slightly faster because of pmullw latency. It would also cut our rodata
312 pmullw m1, filter_y_a
313 pmullw m5, filter_y_b
315 pmullw m0, filter_y_a
316 pmullw m4, filter_y_b
340 pmullw m1, filter_y_a
341 pmullw m5, filter_y_b
343 pmullw m0, filter_y_a
344 pmullw m4, filter_y_b
543 pmullw m1, filter_y_a
[all …]
subpel_variance_sse2.asm
405 ; slightly faster because of pmullw latency. It would also cut our rodata
407 pmullw m2, filter_y_a
408 pmullw m3, filter_y_b
410 pmullw m0, filter_y_a
411 pmullw m4, filter_y_b
449 pmullw m0, filter_y_a
450 pmullw m1, m2, filter_y_b
453 pmullw m2, filter_y_a
454 pmullw m4, filter_y_b
883 pmullw m2, filter_x_a
[all …]
vpx_subpixel_bilinear_sse2.asm
39 pmullw xmm0, xmm4 ;multiply the filter factors
87 pmullw xmm0, xmm6
88 pmullw xmm1, xmm7
110 pmullw xmm0, xmm6
111 pmullw xmm1, xmm7
112 pmullw xmm2, xmm6
113 pmullw xmm3, xmm7
vpx_subpixel_8t_sse2.asm
61 pmullw xmm0, k0k1 ;multiply the filter factors
62 pmullw xmm6, k6k7
63 pmullw xmm2, k2k3
64 pmullw xmm5, k5k4
152 pmullw xmm0, k0
153 pmullw xmm1, k1
154 pmullw xmm6, k6
155 pmullw xmm7, k7
156 pmullw xmm2, k2
157 pmullw xmm5, k5
[all …]
/external/libvpx/libvpx/vp8/common/x86/
subpixel_sse2.asm
77 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
81 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
87 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
92 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
98 pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
101 pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
206 pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
210 pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
216 pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
221 pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
[all …]
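The VP8 subpixel entry above is the classic 6-tap FIR pattern: each sample x[k] is multiplied by its tap H[k] with pmullw and the six products are accumulated (the .asm uses saturating paddsw) before rounding and packing. A rough sketch of that pattern; the tap layout, rounding constant, and shift are assumptions based on the usual VP8 filter precision, not copied from subpixel_sse2.asm:

    #include <emmintrin.h>

    /* Six-tap filter for eight output pixels.  Each taps[k] holds one
       16-bit coefficient broadcast to all lanes, matching the [rdx],
       [rdx+16], ... tap tables above.  Rounding by 64 and shifting by 7
       are assumed (128-weight filter). */
    static __m128i six_tap(const __m128i x[6], const __m128i taps[6]) {
        __m128i acc = _mm_set1_epi16(64);                     /* rounding */
        for (int k = 0; k < 6; k++)
            acc = _mm_adds_epi16(acc, _mm_mullo_epi16(x[k], taps[k]));
        return _mm_srai_epi16(acc, 7);                        /* scale back */
    }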
subpixel_mmx.asm
58 pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
62 pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
68 pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
74 pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
80 pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
84 pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
157 pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
161 pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
165 pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
169 pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
[all …]
dequantize_mmx.asm
31 pmullw mm1, [rax+0] ; mm4 *= kernel 0 modifiers.
35 pmullw mm1, [rax+8] ; mm4 *= kernel 0 modifiers.
39 pmullw mm1, [rax+16] ; mm4 *= kernel 0 modifiers.
43 pmullw mm1, [rax+24] ; mm4 *= kernel 0 modifiers.
73 pmullw mm0, [rdx]
76 pmullw mm1, [rdx +8]
79 pmullw mm2, [rdx+16]
82 pmullw mm3, [rdx+24]
mfqe_sse2.asm
59 pmullw xmm2, xmm0
60 pmullw xmm3, xmm0
66 pmullw xmm4, xmm1
67 pmullw xmm5, xmm1
134 pmullw xmm2, xmm0
138 pmullw xmm3, xmm1
idctllm_sse2.asm
41 pmullw xmm4, xmm5
142 pmullw xmm0, [rdx]
143 pmullw xmm2, [rdx+16]
144 pmullw xmm1, [rdx]
145 pmullw xmm3, [rdx+16]
474 pmullw xmm0, [rdx]
475 pmullw xmm2, [rdx+16]
476 pmullw xmm1, [rdx]
477 pmullw xmm3, [rdx+16]
/external/llvm/test/CodeGen/X86/
pmul.ll
16 ; SSE2-NEXT: pmullw %xmm1, %xmm2
21 ; SSE2-NEXT: pmullw %xmm1, %xmm0
30 ; SSE41-NEXT: pmullw %xmm2, %xmm1
35 ; SSE41-NEXT: pmullw %xmm2, %xmm0
79 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
153 ; SSE2-NEXT: pmullw %xmm2, %xmm3
160 ; SSE2-NEXT: pmullw %xmm1, %xmm0
169 ; SSE41-NEXT: pmullw %xmm3, %xmm2
176 ; SSE41-NEXT: pmullw %xmm1, %xmm0
220 ; SSE-NEXT: pmullw %xmm1, %xmm0
[all …]
vec_shift6.ll
8 ; Check that we produce a SSE2 packed integer multiply (pmullw) instead.
13 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
32 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
91 ; into two pmullw instructions. With AVX2, the test case below would produce
98 ; SSE-NEXT: pmullw %xmm2, %xmm0
99 ; SSE-NEXT: pmullw %xmm2, %xmm1
142 ; parts and then we convert each part into a pmullw.
148 ; SSE-NEXT: pmullw %xmm4, %xmm0
149 ; SSE-NEXT: pmullw %xmm4, %xmm1
150 ; SSE-NEXT: pmullw %xmm4, %xmm2
[all …]
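The vec_shift6.ll tests check a lowering detail: SSE2 has no per-lane 16-bit shift by a non-uniform constant vector, so a <8 x i16> left shift by constants is rewritten as a pmullw against the corresponding powers of two (the {{.*}}(%rip) constant-pool operand above). A minimal C illustration of the equivalence; the shift amounts are made up for the example:

    #include <emmintrin.h>

    /* x << {1,2,3,4,1,2,3,4} per lane == x * {2,4,8,16,2,4,8,16};
       the compiler emits a single pmullw against a constant vector. */
    static __m128i shl_by_consts(__m128i x) {
        const __m128i pow2 = _mm_setr_epi16(2, 4, 8, 16, 2, 4, 8, 16);
        return _mm_mullo_epi16(x, pow2);
    }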
/external/swiftshader/third_party/llvm-7.0/llvm/test/Transforms/LoopVectorize/X86/
mul_slm_16bit.ll
34 ; use pmullw\sext seq.
38 ; use pmulhw\pmullw\pshuf
44 ; use pmullw\zext
50 ; use pmullw\sext
55 ; use pmulhw\pmullw\pshuf
60 ; use pmulhw\pmullw\pshuf
65 ; use pmullw\zext
103 ; use pmulhw\pmullw\pshuf seq.
113 ; use pmulhw\pmullw\zext
119 ; use pmulhw\pmullw\sext
[all …]
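The SLM cost-model comments above ("pmulhw\pmullw\pshuf", "pmullw\sext", "pmullw\zext") describe how a widening 16x16->32-bit vector multiply is assembled: pmullw supplies the low halves, pmulhw the high halves, and an unpack/shuffle interleaves them into 32-bit lanes. A sketch of that sequence with SSE2 intrinsics, not taken from the test file:

    #include <emmintrin.h>

    /* Widen a signed 16x16 multiply to 32-bit results for the low four
       lanes: pmullw gives bits 0..15, pmulhw bits 16..31, and
       punpcklwd interleaves them into four i32 products. */
    static __m128i mul_widen_lo(__m128i a, __m128i b) {
        __m128i lo = _mm_mullo_epi16(a, b);   /* pmullw */
        __m128i hi = _mm_mulhi_epi16(a, b);   /* pmulhw */
        return _mm_unpacklo_epi16(lo, hi);
    }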
/external/swiftshader/third_party/llvm-7.0/llvm/test/CodeGen/X86/
pmul.ll
14 ; SSE2-NEXT: pmullw %xmm2, %xmm1
18 ; SSE2-NEXT: pmullw %xmm2, %xmm0
28 ; SSE41-NEXT: pmullw %xmm2, %xmm0
31 ; SSE41-NEXT: pmullw %xmm2, %xmm1
74 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
146 ; SSE2-NEXT: pmullw %xmm2, %xmm3
151 ; SSE2-NEXT: pmullw %xmm1, %xmm0
162 ; SSE41-NEXT: pmullw %xmm1, %xmm0
165 ; SSE41-NEXT: pmullw %xmm3, %xmm2
211 ; SSE-NEXT: pmullw %xmm1, %xmm0
[all …]
vec_shift6.ll
9 ; Check that we produce a SSE2 packed integer multiply (pmullw) instead.
14 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
28 ; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
93 ; into two pmullw instructions. With AVX2, the test case below would produce
100 ; SSE-NEXT: pmullw %xmm2, %xmm0
101 ; SSE-NEXT: pmullw %xmm2, %xmm1
153 ; parts and then we convert each part into a pmullw.
159 ; SSE-NEXT: pmullw %xmm4, %xmm0
160 ; SSE-NEXT: pmullw %xmm4, %xmm1
161 ; SSE-NEXT: pmullw %xmm4, %xmm2
[all …]
vector-reduce-mul.ll
1133 ; SSE-NEXT: pmullw %xmm0, %xmm1
1135 ; SSE-NEXT: pmullw %xmm1, %xmm0
1138 ; SSE-NEXT: pmullw %xmm0, %xmm1
1173 ; SSE-NEXT: pmullw %xmm1, %xmm0
1175 ; SSE-NEXT: pmullw %xmm0, %xmm1
1177 ; SSE-NEXT: pmullw %xmm1, %xmm0
1180 ; SSE-NEXT: pmullw %xmm0, %xmm1
1236 ; SSE-NEXT: pmullw %xmm3, %xmm1
1237 ; SSE-NEXT: pmullw %xmm2, %xmm0
1238 ; SSE-NEXT: pmullw %xmm1, %xmm0
[all …]
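The reduce-mul tests above repeatedly fold the vector in half, multiplying the two halves with pmullw until a single lane holds the product. A sketch of that log2-step reduction for a v8i16; the exact shuffle sequence is an assumption, not the codegen from the test:

    #include <emmintrin.h>

    /* Horizontal product of eight 16-bit lanes: three shuffle+pmullw
       steps instead of seven scalar multiplies. */
    static short reduce_mul_v8i16(__m128i v) {
        __m128i t = _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)); /* fold upper 64 bits */
        v = _mm_mullo_epi16(v, t);
        t = _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1));         /* fold upper 32 bits */
        v = _mm_mullo_epi16(v, t);
        t = _mm_shufflelo_epi16(v, _MM_SHUFFLE(2, 3, 0, 1));       /* fold last pair */
        v = _mm_mullo_epi16(v, t);
        return (short)_mm_cvtsi128_si32(v);                        /* lane 0 = product */
    }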
vector-mul.ll
156 ; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
161 ; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
184 ; X86-NEXT: pmullw %xmm2, %xmm0
187 ; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm1
199 ; X64-NEXT: pmullw %xmm2, %xmm0
202 ; X64-NEXT: pmullw {{.*}}(%rip), %xmm1
296 ; X86-NEXT: pmullw {{\.LCPI.*}}, %xmm0
301 ; X64-NEXT: pmullw {{.*}}(%rip), %xmm0
318 ; X86-NEXT: pmullw %xmm2, %xmm0
321 ; X86-NEXT: pmullw %xmm2, %xmm1
[all …]
/external/libvpx/libvpx/vp9/common/x86/
vp9_mfqe_sse2.asm
60 pmullw xmm2, xmm0
61 pmullw xmm3, xmm0
67 pmullw xmm4, xmm1
68 pmullw xmm5, xmm1
135 pmullw xmm2, xmm0
139 pmullw xmm3, xmm1
/external/mesa3d/src/mesa/x86/
read_rgba_span_x86.S
553 pmullw %mm6, %mm0
554 pmullw %mm6, %mm2
588 pmullw %mm6, %mm0
589 pmullw %mm6, %mm2
626 pmullw %mm6, %mm0
627 pmullw %mm6, %mm2
657 pmullw %mm6, %mm0
/external/libvpx/libvpx/vp9/encoder/x86/
vp9_quantize_ssse3_x86_64.asm
76 pmullw m8, m3 ; r4[i] = r3[i] * q
78 pmullw m13, m3 ; r4[i] = r3[i] * q
130 pmullw m14, m3 ; r4[i] = r3[i] * q
131 pmullw m13, m3 ; r4[i] = r3[i] * q
