Home
last modified time | relevance | path

Searched refs:vk02 (Results 1 – 25 of 182) sorted by relevance

12345678

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-scalar-3x1.c37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local
181 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
182 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
183 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
308 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
309 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
310 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
397 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
398 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
399 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
D5x5p2-minmax-scalar-3x1-acc2.c37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local
181 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
182 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
183 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
311 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
312 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
313 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
403 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
404 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
405 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
D5x5s2p2-minmax-scalar-3x1.c38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local
220 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
221 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
222 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
334 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
335 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
336 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
411 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
412 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
413 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
D5x5s2p2-minmax-scalar-3x1-acc2.c38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local
220 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
221 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
222 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
337 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
338 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
339 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
417 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
418 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
419 vo2p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
D3x3p1-minmax-scalar-5x1.c37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local
155 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
156 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
157 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
158 vo3p0 += vi3x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
159 vo4p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
D5x5p2-minmax-scalar-2x1.c37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local
158 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
159 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
253 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
254 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
318 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
319 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
D3x3p1-minmax-scalar-6x1.c37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local
172 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
173 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
174 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
175 vo3p0 += vi3x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
176 vo4p0 += vi4x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
177 vo5p0 += vi5x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
D3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local
189 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
190 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
192 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
193 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
296 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
297 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
298 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
299 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local
209 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
210 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
211 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
212 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
213 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
214 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
333 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
334 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
335 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
[all …]
D3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local
209 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
210 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
211 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
212 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
213 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
214 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
333 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
334 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
335 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
[all …]
D3x3p1-minmax-ssse3-6x4.c40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local
202 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
203 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
204 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
205 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
206 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
207 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
328 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
329 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
330 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
[all …]
D5x5s2p2-minmax-scalar-2x1.c38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local
184 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
185 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
266 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
267 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
320 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
321 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
D5x5p2-minmax-scalar-2x1-acc3.c37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local
158 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
159 vo1p1 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
257 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
258 vo1p1 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
326 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
327 vo1p1 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
D5x5p2-minmax-scalar-2x1-acc2.c37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() local
158 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
159 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
255 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
256 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
322 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
323 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
D5x5s2p2-minmax-scalar-2x1-acc2.c38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() local
184 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
185 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
268 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
269 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
324 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
325 vo1p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
D5x5s2p2-minmax-scalar-2x1-acc3.c38 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() local
184 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
185 vo1p1 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
270 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
271 vo1p1 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
328 vo0p1 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
329 vo1p1 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
D3x3p1-minmax-sse-6x4.c40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local
268 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
269 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
270 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
271 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
272 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
273 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
444 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
445 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
446 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local
189 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
190 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
192 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
193 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
296 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
297 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
298 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
299 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
[all …]
D3x3p1-minmax-ssse3-5x4.c40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local
182 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
183 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
184 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
185 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
186 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
291 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
292 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
293 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
294 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
[all …]
D3x3p1-minmax-sse-5x4.c40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local
240 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
241 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
242 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
243 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
244 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
393 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
394 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
395 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
396 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
[all …]
D3x3p1-minmax-scalar-4x1.c37 const float vk02 = weights[3]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local
138 vo0p0 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
139 vo1p0 += vi1x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
140 vo2p0 += vi2x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
141 vo3p0 += vi3x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
D3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local
169 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
170 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
171 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
172 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
259 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
260 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
261 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
262 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
D3x3p1-minmax-ssse3-4x4.c40 const __m128 vk02 = _mm_load1_ps(weights + 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local
162 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
163 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
164 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
165 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
254 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
255 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
256 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
257 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local
169 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
170 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
171 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
172 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
259 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
260 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
261 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
262 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-3x4.c45 const v128_t vk02 = wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() local
149 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
150 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
151 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
222 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
223 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
224 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()

12345678