Home
last modified time | relevance | path

Searched refs:vi5x2345 (Results 1 – 25 of 56) sorted by relevance

123

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-neon-4x4.c185 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
205 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
209 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
213 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
403 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
423 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
427 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
431 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
610 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
627 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
[all …]
D5x5p2-minmax-neon-4x4-acc2.c185 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
205 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
209 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
213 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
407 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
427 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
431 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
435 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
618 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
635 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
[all …]
D5x5p2-minmax-neonfma-4x4.c185 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
205 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
209 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
213 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
403 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
423 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
427 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
431 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
610 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
627 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
[all …]
D5x5p2-minmax-neonfma-4x4-acc2.c185 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
205 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
209 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
213 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
407 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
427 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
431 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
435 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
618 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
635 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c235 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
254 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
259 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
264 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
269 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
489 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
508 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
513 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
518 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
523 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-neonfma-5x4.c206 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
225 vo4p0 = vfmaq_lane_f32(vo4p0, vi5x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
230 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
235 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
240 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
461 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
480 vo4p0 = vfmaq_lane_f32(vo4p0, vi5x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
485 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
490 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
495 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
[all …]
D5x5p2-minmax-neon-5x4.c206 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
225 vo4p0 = vmlaq_lane_f32(vo4p0, vi5x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
230 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
235 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
240 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
461 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
480 vo4p0 = vmlaq_lane_f32(vo4p0, vi5x2345, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
485 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
490 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
495 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c235 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
254 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
259 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
264 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
269 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
489 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
508 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
513 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
518 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
523 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
[all …]
D5x5p2-minmax-sse-5x4.c239 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
276 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
280 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
284 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
288 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
492 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
529 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
533 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
537 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
541 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c214 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
234 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
238 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
242 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
435 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
455 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
459 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
463 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
645 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
662 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c214 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
234 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
238 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
242 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
431 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
451 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
455 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
459 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
637 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
654 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c214 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
234 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
238 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
242 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
435 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
455 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
459 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
463 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
645 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
662 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c214 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
234 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
238 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
242 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
431 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
451 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
455 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
459 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
637 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
654 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
[all …]
D5x5p2-minmax-neonfma-3x4.c164 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local
183 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
186 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
345 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local
364 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
367 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
516 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local
533 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
536 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c193 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
212 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
215 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
373 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
392 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
395 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
543 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
560 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
563 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c193 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
212 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
215 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
376 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
395 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
398 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
549 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
566 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
569 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c193 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
212 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
215 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
373 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
392 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
395 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
543 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
560 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
563 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
D5x5p2-minmax-neon-3x4-acc2.c164 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local
183 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
186 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
348 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local
367 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
370 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
522 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local
539 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
542 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
D5x5p2-minmax-neon-3x4.c164 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local
183 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
186 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
345 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local
364 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
367 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
516 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local
533 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
536 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
D5x5p2-minmax-neonfma-3x4-acc2.c164 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local
183 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
186 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
348 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local
367 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
370 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
522 const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local
539 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
542 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
D5x5p2-minmax-wasmsimd-x86-splat-5x4.c209 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local
228 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
233 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
238 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
243 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
463 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local
482 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
487 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
492 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
497 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-5x4.c209 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local
228 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
233 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
238 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
243 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
463 const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local
482 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
487 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
492 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
497 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
[all …]
D5x5p2-minmax-sse-4x4.c216 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
251 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
254 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
257 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
430 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
465 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
468 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
471 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
634 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
659 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
[all …]
D5x5p2-minmax-sse-4x4-acc2.c216 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
251 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
254 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
257 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
434 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
469 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
472 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
475 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
642 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
667 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
[all …]
D5x5p2-minmax-sse-3x4-acc2.c193 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
224 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
226 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
371 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
402 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
404 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
540 const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
563 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
565 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()

123