Home
last modified time | relevance | path

Searched refs:vi5x8ACE (Results 1 – 25 of 51) sorted by relevance

123

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() local
174 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
349 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
370 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
395 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
441 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() local
174 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
351 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
372 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
397 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
443 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() local
174 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
351 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
372 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
397 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
443 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() local
174 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
349 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
370 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
395 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
441 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() local
174 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
353 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
374 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
399 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
445 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4-acc3.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() local
174 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
353 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
374 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
399 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
445 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
D5x5s2p2-minmax-sse-2x4-acc2.c150 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() local
164 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
173 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
301 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
302 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
353 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
365 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
374 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
440 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
D5x5s2p2-minmax-sse-2x4-acc3.c150 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() local
164 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
173 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
301 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
302 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
355 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
367 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
376 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
442 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
D5x5s2p2-minmax-sse-2x4.c150 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() local
164 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
173 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
301 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
302 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
351 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
363 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
372 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
438 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
D5x5s2p2-minmax-wasmsimd-x86-splat-2x4-acc2.c134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() local
154 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
331 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
352 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
377 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
423 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
D5x5s2p2-minmax-wasmsimd-x86-splat-2x4.c134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() local
154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
329 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
350 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
375 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
421 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
D5x5s2p2-minmax-wasmsimd-arm-splat-2x4.c134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() local
154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
329 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
350 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
375 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
421 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
D5x5s2p2-minmax-wasmsimd-arm-splat-2x4-acc2.c134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() local
154 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
331 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
352 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
377 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
423 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
D5x5s2p2-minmax-wasmsimd-arm-splat-2x4-acc3.c134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() local
154 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
333 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
354 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
379 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
425 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
D5x5s2p2-minmax-wasmsimd-x86-splat-2x4-acc3.c134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() local
154 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
333 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
354 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
379 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
425 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() local
195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
202 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
240 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
430 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
451 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
458 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4.c174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() local
195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
202 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
240 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
427 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
448 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
455 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() local
195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
202 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
240 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
430 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
451 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
458 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4.c174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() local
195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
202 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
240 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
427 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
448 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
455 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
[all …]
D5x5s2p2-minmax-sse-3x4.c170 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() local
186 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
191 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
202 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
369 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
434 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
448 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
453 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
464 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4()
[all …]
D5x5s2p2-minmax-sse-3x4-acc2.c170 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() local
186 __m128 vo2p1 = _mm_mul_ps(vi5x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
191 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
202 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
369 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
437 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
451 __m128 vo2p1 = _mm_mul_ps(vi5x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
456 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
467 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-splat-3x4-acc2.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() local
175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
182 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
219 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
220 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
349 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
350 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
410 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
431 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
438 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-splat-3x4-acc2.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() local
175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
182 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
219 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
220 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
349 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
350 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
410 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
431 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
438 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-splat-3x4.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() local
175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
182 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
219 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
220 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
349 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
350 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
407 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
428 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
435 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-splat-3x4.c154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() local
175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
182 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
219 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
220 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
349 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
350 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
407 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
428 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
435 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
[all …]

123