/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-sse-5x4.c | 299 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 317 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 321 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 325 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 335 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 552 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 570 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 574 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 578 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 588 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | 206 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 220 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 225 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 330 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 344 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 349 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | 206 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 220 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 225 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 330 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 344 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 349 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
|
D | 3x3p1-minmax-neon-6x4.c | 187 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local 202 vo5p0 = vmlaq_lane_f32(vo5p0, vi6x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 208 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 320 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local 335 vo5p0 = vmlaq_lane_f32(vo5p0, vi6x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 341 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
|
D | 3x3p1-minmax-ssse3-6x4.c | 199 …const __m128 vi6x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi6x89AB), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 213 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 218 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 325 … const __m128 vi6x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi6x4567), 4)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 339 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 344 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
|
D | 3x3p1-minmax-neonfma-6x4.c | 187 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local 202 vo5p0 = vfmaq_lane_f32(vo5p0, vi6x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 208 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 320 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local 335 vo5p0 = vfmaq_lane_f32(vo5p0, vi6x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 341 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 280 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 300 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 305 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 310 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 534 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 554 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 559 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 564 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 772 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 792 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-neonfma-5x4.c | 251 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 271 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 276 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 281 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 506 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 526 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 531 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 536 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 746 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 766 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() [all …]
|
D | 5x5p2-minmax-neon-5x4.c | 251 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 271 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 276 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 281 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 506 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 526 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 531 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 536 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 746 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 766 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x5678, vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | 280 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 300 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 305 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 310 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 534 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 554 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 559 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 564 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 772 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 792 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 3x3p1-minmax-sse-6x4.c | 264 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local 279 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 284 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 440 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local 455 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi6x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 460 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4()
|
D | 5x5p2-minmax-sse-4x4.c | 267 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 285 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 288 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 297 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 481 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 499 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 502 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 511 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 675 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 693 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() [all …]
|
D | 5x5p2-minmax-sse-4x4-acc2.c | 267 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 285 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 288 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 297 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 485 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 503 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 506 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 515 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 683 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 701 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() [all …]
|
D | 5x5p2-minmax-neon-4x4.c | 223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 244 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 248 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 441 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 462 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 466 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 646 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 667 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 671 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 680 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
|
D | 5x5p2-minmax-neon-4x4-acc2.c | 223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 244 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 248 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 445 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 466 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 470 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 654 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 675 vo3p0 = vmlaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 679 vo2p1 = vmlaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 688 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
|
D | 5x5p2-minmax-neonfma-4x4.c | 223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 244 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 248 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 441 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 462 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 466 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 646 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 667 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 671 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 680 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | 223 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 244 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 248 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 445 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 466 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 470 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 654 const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 675 vo3p0 = vfmaq_lane_f32(vo3p0, vi6x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 679 vo2p1 = vfmaq_lane_f32(vo2p1, vi6x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 688 const float32x4_t vi6x6789 = vextq_f32(vi6x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
|
D | 5x5p2-minmax-sse-3x4-acc2.c | 235 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 251 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 259 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 413 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 429 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 437 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 574 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 590 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 598 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
|
D | 5x5p2-minmax-sse-3x4.c | 235 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 251 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 259 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 410 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 426 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 434 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vi6x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 568 const __m128 vi6x5678 = _mm_shuffle_ps(vi6x8567, vi6x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 584 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 592 const __m128 vi6x6789 = _mm_shuffle_ps(vi6x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-6x4.c | 190 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() local 205 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 211 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 321 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() local 336 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 342 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-6x4.c | 190 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() local 205 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 211 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 321 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() local 336 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 342 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 252 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 273 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 277 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 473 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 494 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 498 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 680 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 701 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 705 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 714 const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | 252 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 273 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 277 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 469 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 490 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 494 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 672 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 693 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 697 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 706 const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 252 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 273 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 277 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 473 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 494 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 498 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 680 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 701 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 705 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 714 const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | 252 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 273 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 277 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 469 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 490 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 494 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 672 const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 693 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi6x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 697 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 706 const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
|