/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 328 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 355 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 360 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 582 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 609 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 614 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 813 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 838 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 843 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
|
D | 5x5p2-minmax-neonfma-5x4.c | 299 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 326 vo4p0 = vfmaq_lane_f32(vo4p0, vi7x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 331 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 554 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 581 vo4p0 = vfmaq_lane_f32(vo4p0, vi7x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 586 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 787 const float32x4_t vi7x6789 = vextq_f32(vi7x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 812 vo4p0 = vfmaq_lane_f32(vo4p0, vi7x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 817 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
|
D | 5x5p2-minmax-neon-5x4.c | 299 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 326 vo4p0 = vmlaq_lane_f32(vo4p0, vi7x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 331 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 554 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 581 vo4p0 = vmlaq_lane_f32(vo4p0, vi7x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 586 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 787 const float32x4_t vi7x6789 = vextq_f32(vi7x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 812 vo4p0 = vmlaq_lane_f32(vo4p0, vi7x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 817 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | 328 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 355 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 360 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 582 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 609 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 614 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 813 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 838 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 843 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
|
D | 5x5p2-minmax-sse-5x4.c | 336 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 358 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 362 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 589 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 611 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 615 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 819 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 841 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi7x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 845 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
|
D | 5x5p2-minmax-wasmsimd-x86-splat-5x4.c | 302 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local 329 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 334 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 556 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local 583 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 588 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 787 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local 812 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 817 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
|
D | 5x5p2-minmax-wasmsimd-arm-splat-5x4.c | 302 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local 329 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 334 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 556 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local 583 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 588 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 787 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local 812 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 817 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
|
D | 5x5p2-minmax-neon-4x4.c | 265 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 291 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 483 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 509 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 681 const float32x4_t vi7x6789 = vextq_f32(vi7x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 706 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
|
D | 5x5p2-minmax-neon-4x4-acc2.c | 265 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 291 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 487 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 513 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 689 const float32x4_t vi7x6789 = vextq_f32(vi7x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 714 vo3p0 = vmlaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
|
D | 5x5p2-minmax-neonfma-4x4.c | 265 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 291 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 483 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 509 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 681 const float32x4_t vi7x6789 = vextq_f32(vi7x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 706 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | 265 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 291 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 487 const float32x4_t vi7x6789 = vextq_f32(vi7x4567, vi7x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 513 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 689 const float32x4_t vi7x6789 = vextq_f32(vi7x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 714 vo3p0 = vfmaq_lane_f32(vo3p0, vi7x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 294 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 515 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 541 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 715 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 740 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | 294 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 511 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 537 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 707 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 732 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 294 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 515 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 541 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 715 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 740 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | 294 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 320 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 511 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 537 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 707 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 732 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
|
D | 5x5p2-minmax-sse-4x4.c | 298 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 319 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 512 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 533 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 706 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 727 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
|
D | 5x5p2-minmax-sse-4x4-acc2.c | 298 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 319 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 516 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vi7x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 537 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 714 const __m128 vi7x6789 = _mm_shuffle_ps(vi7x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 735 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi7x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4-acc2.c | 268 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() local 294 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 489 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() local 515 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() 689 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2() local 714 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4.c | 268 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local 294 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 485 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local 511 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 681 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local 706 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4.c | 268 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local 294 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 485 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local 511 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 681 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local 706 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4-acc2.c | 268 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() local 294 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 489 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x4567, vi7x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() local 515 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() 689 const v128_t vi7x6789 = wasm_v32x4_shuffle(vi7x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2() local 714 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi7x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4_acc2()
|