Home
last modified time | relevance | path

Searched refs:vi6x5678 (Results 1 – 25 of 51) sorted by relevance

123

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-sse-5x4.c299 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
317 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
321 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
325 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
335 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
552 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
570 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
574 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
578 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
588 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
[all …]
D3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c206 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local
220 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
225 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
330 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local
344 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
349 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c206 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local
220 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
225 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
330 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local
344 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
349 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
D3x3p1-minmax-neon-6x4.c187 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local
202 vo5p0 = vmlaq_lane_f32(vo5p0, vi6x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
208 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
320 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local
335 vo5p0 = vmlaq_lane_f32(vo5p0, vi6x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
341 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
D3x3p1-minmax-ssse3-6x4.c199 …const __m128 vi6x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi6x89AB), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local
213 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
218 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
325 … const __m128 vi6x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi6x4567), 4)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local
339 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
344 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
D3x3p1-minmax-neonfma-6x4.c187 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local
202 vo5p0 = vfmaq_lane_f32(vo5p0, vi6x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
208 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
320 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local
335 vo5p0 = vfmaq_lane_f32(vo5p0, vi6x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
341 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c280 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
300 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
305 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
310 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
534 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
554 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
559 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
564 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
772 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
792 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-neonfma-5x4.c251 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
271 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
276 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
281 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
506 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
526 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
531 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
536 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
746 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
766 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
[all …]
D5x5p2-minmax-neon-5x4.c251 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
271 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
276 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
281 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
506 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
526 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
531 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
536 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
746 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
766 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c280 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
300 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
305 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
310 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
534 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
554 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
559 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
564 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
772 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
792 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
[all …]
D3x3p1-minmax-sse-6x4.c264 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local
279 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
284 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
440 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local
455 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
460 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
D5x5p2-minmax-sse-4x4.c267 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
285 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
288 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
297 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
481 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
499 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
502 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
511 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
675 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
693 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
[all …]
D5x5p2-minmax-sse-4x4-acc2.c267 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
285 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
288 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
297 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
485 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
503 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
506 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
515 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
683 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
701 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
[all …]
D5x5p2-minmax-neon-4x4.c223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
244 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
248 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
441 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
462 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
466 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
646 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
667 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
671 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
680 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
D5x5p2-minmax-neon-4x4-acc2.c223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
244 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
248 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
445 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
466 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
470 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
654 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
675 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
679 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
688 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
D5x5p2-minmax-neonfma-4x4.c223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
244 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
248 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
441 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
462 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
466 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
646 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
667 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
671 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
680 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
D5x5p2-minmax-neonfma-4x4-acc2.c223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
244 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
248 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
445 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
466 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
470 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
654 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
675 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
679 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
688 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
D5x5p2-minmax-sse-3x4-acc2.c235 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
251 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
259 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
413 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
429 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
437 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
574 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
590 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
598 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
D5x5p2-minmax-sse-3x4.c235 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
251 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
259 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
410 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
426 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
434 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
568 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
584 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
592 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
D3x3p1-minmax-wasmsimd-arm-splat-6x4.c190 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() local
205 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
211 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
321 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() local
336 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
342 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
D3x3p1-minmax-wasmsimd-x86-splat-6x4.c190 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() local
205 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
211 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
321 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() local
336 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
342 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c252 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
273 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
277 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
473 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
494 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
498 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
680 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
701 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
705 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
714 const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c252 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
273 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
277 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
469 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
490 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
494 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
672 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
693 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
697 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
706 const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c252 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
273 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
277 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
473 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
494 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
498 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
680 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
701 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
705 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
714 const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c252 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
273 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
277 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
469 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
490 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
494 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
672 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
693 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
697 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
706 const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()

123