/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | 153 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 161 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 165 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 268 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 276 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 280 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 284 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
|
D | 3x3p1-minmax-neon-5x4.c | 133 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local 141 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 146 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 151 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 256 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local 264 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 269 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 274 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
|
D | 3x3p1-minmax-neonfma-5x4.c | 133 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local 141 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 146 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 151 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 256 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local 264 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 269 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 274 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c | 153 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 161 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 165 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 268 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 276 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 280 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 284 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
|
D | 3x3p1-minmax-ssse3-5x4.c | 146 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local 154 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 158 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 162 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 262 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local 270 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 274 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 278 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
|
D | 3x3p1-minmax-sse-5x4.c | 180 const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local 190 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 194 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 198 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 340 const __m128 vi4x3456 = _mm_move_ss(vi4x7456, vi4x3012); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local 350 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 354 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 358 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
|
D | 5x5p2-minmax-neon-4x4.c | 145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 158 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 162 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 166 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 170 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 363 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 376 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 380 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 384 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 388 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() [all …]
|
D | 5x5p2-minmax-neon-4x4-acc2.c | 145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 158 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 162 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 166 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 170 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 367 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 380 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 384 vo2p1 = vmlaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 388 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 392 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-4x4.c | 145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 162 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 166 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 170 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 363 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 376 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 380 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 384 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 388 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() [all …]
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | 145 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 158 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 162 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 166 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 170 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 367 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 380 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 384 vo2p1 = vfmaq_lane_f32(vo2p1, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 388 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 392 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | 167 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 176 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 181 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 300 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 309 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 314 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | 167 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 176 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 181 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 300 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 309 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 314 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-5x4.c | 136 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() local 144 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 149 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 154 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 258 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() local 266 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 271 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 276 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-5x4.c | 136 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() local 144 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 149 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 154 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 258 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() local 266 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 271 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 276 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
|
D | 3x3p1-minmax-neon-6x4.c | 146 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local 155 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 161 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 167 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 287 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local 296 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 302 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 308 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4()
|
D | 3x3p1-minmax-ssse3-6x4.c | 160 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 169 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 174 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 179 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 294 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 303 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 308 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 313 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
|
D | 3x3p1-minmax-neonfma-6x4.c | 146 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local 155 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 161 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 167 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 287 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local 296 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x3456, vget_low_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 302 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 308 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4()
|
D | 5x5p2-minmax-neonfma-1x4-acc2.c | 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 110 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 208 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 218 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 313 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 323 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
|
D | 5x5p2-minmax-neonfma-1x4-acc3.c | 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 110 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 209 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 219 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 315 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 325 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c | 139 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 149 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 152 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 236 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 246 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 249 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
|
D | 3x3p1-minmax-neonfma-4x4.c | 120 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() local 131 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 135 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 225 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() local 236 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x3456, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 240 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
|
D | 3x3p1-minmax-ssse3-4x4.c | 132 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 142 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 145 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 230 …const __m128 vi4x3456 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x4567), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 240 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 243 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
|
D | 5x5p2-minmax-neon-1x4.c | 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 110 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 207 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 217 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 311 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 321 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c | 139 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 149 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 152 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 236 const v128_t vi4x3456 = wasm_v32x4_shuffle(vi4x0123, vi4x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 246 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x3456, vk10)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 249 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
|
D | 5x5p2-minmax-neonfma-1x4.c | 100 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 110 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 207 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 217 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 311 const float32x4_t vi4x3456 = vextq_f32(vi4x0123, vi4x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 321 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
|