Home
last modified time | relevance | path

Searched refs:vi4x3456 (Results 1 – 25 of 123) sorted by relevance

12345

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c153 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local
161 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
165 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
268 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local
276 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
280 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
284 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
D3x3p1-minmax-neon-5x4.c133 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local
141 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
146 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
151 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
256 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local
264 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
269 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
274 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
D3x3p1-minmax-neonfma-5x4.c133 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local
141 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
146 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
151 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
256 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local
264 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
269 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
274 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
D3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c153 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local
161 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
165 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
268 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local
276 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
280 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
284 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
D3x3p1-minmax-ssse3-5x4.c146 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local
154 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
158 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
162 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
262 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local
270 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
274 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
278 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
D3x3p1-minmax-sse-5x4.c180 const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local
190 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
194 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
198 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
340 const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local
350 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
354 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
358 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
D5x5p2-minmax-neon-4x4.c145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
158 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
162 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
166 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
170 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
363 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
376 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
380 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
384 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
388 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
[all …]
D5x5p2-minmax-neon-4x4-acc2.c145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
158 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
162 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
166 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
170 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
367 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
380 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
384 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
388 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
392 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
[all …]
D5x5p2-minmax-neonfma-4x4.c145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
162 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
166 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
170 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
363 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
376 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
380 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
384 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
388 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
[all …]
D5x5p2-minmax-neonfma-4x4-acc2.c145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
162 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
166 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
170 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
367 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
380 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
384 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
388 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
392 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
[all …]
D3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c167 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local
176 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
181 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
300 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local
309 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
314 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c167 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local
176 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
181 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
300 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local
309 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
314 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
D3x3p1-minmax-wasmsimd-arm-splat-5x4.c136 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() local
144 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
149 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
154 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
258 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() local
266 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
271 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
276 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
D3x3p1-minmax-wasmsimd-x86-splat-5x4.c136 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() local
144 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
149 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
154 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
258 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() local
266 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
271 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
276 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
D3x3p1-minmax-neon-6x4.c146 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local
155 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
161 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
167 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
287 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local
296 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
302 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
308 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
D3x3p1-minmax-ssse3-6x4.c160 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local
169 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
174 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
179 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
294 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local
303 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
308 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
313 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
D3x3p1-minmax-neonfma-6x4.c146 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local
155 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
161 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
167 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
287 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local
296 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
302 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
308 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
D5x5p2-minmax-neonfma-1x4-acc2.c100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
110 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
208 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
218 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
313 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
323 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
D5x5p2-minmax-neonfma-1x4-acc3.c100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
110 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
209 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
219 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
315 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
325 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
D3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c139 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local
149 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
152 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
236 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local
246 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
249 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
D3x3p1-minmax-neonfma-4x4.c120 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() local
131 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
135 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
225 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() local
236 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
240 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
D3x3p1-minmax-ssse3-4x4.c132 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local
142 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
145 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
230 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local
240 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
243 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
D5x5p2-minmax-neon-1x4.c100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
110 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
207 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
217 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
311 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
321 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c139 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local
149 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
152 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
236 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local
246 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
249 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
D5x5p2-minmax-neonfma-1x4.c100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
110 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
207 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
217 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
311 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
321 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()

12345