Home
last modified time | relevance | path

Searched refs:vi6x3456 (Results 1 – 25 of 51) sorted by relevance

123

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c169 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local
183 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
188 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
302 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local
316 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
321 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
D3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c169 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local
183 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
188 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
302 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local
316 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
321 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
D3x3p1-minmax-neon-6x4.c148 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local
163 vo5p0 = vmlaq_lane_f32(vo5p0, vi6x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
169 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
289 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local
304 vo5p0 = vmlaq_lane_f32(vo5p0, vi6x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
310 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
D3x3p1-minmax-ssse3-6x4.c162 …const __m128 vi6x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi6x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local
176 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
181 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
296 …const __m128 vi6x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi6x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local
310 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
315 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
D3x3p1-minmax-neonfma-6x4.c148 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local
163 vo5p0 = vfmaq_lane_f32(vo5p0, vi6x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
169 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
289 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local
304 vo5p0 = vfmaq_lane_f32(vo5p0, vi6x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
310 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
D3x3p1-minmax-sse-6x4.c202 const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local
217 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
222 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
386 const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local
401 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
406 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
D3x3p1-minmax-wasmsimd-arm-splat-6x4.c151 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() local
166 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
172 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
291 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() local
306 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
312 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
D3x3p1-minmax-wasmsimd-x86-splat-6x4.c151 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() local
166 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
172 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
291 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() local
306 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
312 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c191 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
211 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
216 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
221 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
445 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
465 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
470 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
475 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
692 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
712 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-neonfma-5x4.c162 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
182 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
187 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
192 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
417 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
437 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
442 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
447 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
665 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
685 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
[all …]
D5x5p2-minmax-neon-5x4.c162 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
182 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
187 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
192 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
417 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
437 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
442 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
447 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
665 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
685 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c191 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
211 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
216 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
221 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
445 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
465 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
470 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
475 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
692 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
712 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
[all …]
D5x5p2-minmax-neon-4x4.c147 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
168 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
172 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
365 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
386 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
390 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
577 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
598 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
602 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
D5x5p2-minmax-neon-4x4-acc2.c147 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
168 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
172 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
369 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
390 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
394 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
585 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
606 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
610 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
D5x5p2-minmax-neonfma-4x4.c147 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
168 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
172 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
365 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
386 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
390 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
577 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
598 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
602 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
D5x5p2-minmax-neonfma-4x4-acc2.c147 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
168 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
172 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
369 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
390 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
394 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
585 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
606 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
610 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
D5x5p2-minmax-sse-5x4.c199 const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
217 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
221 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
225 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
452 const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
470 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
474 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
478 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
699 const __m128 vi6x3456 = _mm_move_ss(vi6x7456, vi6x3012); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
717 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
[all …]
D3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c155 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local
171 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
270 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local
286 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
D3x3p1-minmax-neon-5x4.c135 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local
153 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
258 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local
276 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
D3x3p1-minmax-neonfma-5x4.c135 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local
153 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
258 const float32x4_t vi6x3456 = vextq_f32(vi6x0123, vi6x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local
276 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
D5x5p2-minmax-wasmsimd-x86-splat-5x4.c165 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local
185 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
190 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
419 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local
439 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
444 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
449 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
666 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local
686 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-5x4.c165 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local
185 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
190 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
419 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local
439 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
444 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
449 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
666 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local
686 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c176 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
201 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
397 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
418 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
422 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
612 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
633 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
637 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c176 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
393 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
414 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
418 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
604 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
625 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
629 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c176 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
201 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
397 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
418 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
422 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
612 const v128_t vi6x3456 = wasm_v32x4_shuffle(vi6x0123, vi6x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
633 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
637 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()

123