/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neon-4x4.c | 261 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 281 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 285 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 289 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 479 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 499 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 503 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 507 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 679 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 696 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() [all …]
|
D | 5x5p2-minmax-neon-4x4-acc2.c | 261 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 281 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 285 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 289 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 483 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 503 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 507 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 511 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 687 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 704 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-4x4.c | 261 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 281 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 285 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 289 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 479 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 499 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 503 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 507 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 679 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 696 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() [all …]
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | 261 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 281 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 285 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 289 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 483 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 503 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 507 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 511 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 687 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 704 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 324 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 343 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 348 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 353 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 358 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 578 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 597 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 602 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 607 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 612 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-neonfma-5x4.c | 295 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 314 vo4p0 = vfmaq_lane_f32(vo4p0, vi5x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 319 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 324 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 329 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 550 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 569 vo4p0 = vfmaq_lane_f32(vo4p0, vi5x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 574 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 579 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 584 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() [all …]
|
D | 5x5p2-minmax-neon-5x4.c | 295 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 314 vo4p0 = vmlaq_lane_f32(vo4p0, vi5x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 319 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 324 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 329 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 550 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 569 vo4p0 = vmlaq_lane_f32(vo4p0, vi5x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 574 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 579 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 584 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | 324 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 343 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 348 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 353 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 358 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 578 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local 597 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 602 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 607 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() 612 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-sse-5x4.c | 334 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 348 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 352 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 356 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 360 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 587 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local 601 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 605 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 609 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() 613 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 290 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 310 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 314 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 511 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 531 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 535 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 539 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 713 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 730 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | 290 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 310 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 314 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 507 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 527 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 531 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 535 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 705 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 722 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 290 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 310 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 314 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 511 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 531 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 535 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 539 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 713 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 730 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | 290 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 310 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 314 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 507 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 527 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 531 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 535 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 705 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 722 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-neonfma-3x4.c | 227 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local 246 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 249 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 408 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local 427 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 430 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 573 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local 590 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 593 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 256 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 275 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 436 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 455 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 458 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 599 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 616 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 619 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | 256 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 275 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 439 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 458 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 461 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 605 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 622 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 625 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 256 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 275 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 278 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 436 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 455 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 458 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 599 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 616 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 619 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
|
D | 5x5p2-minmax-neon-3x4-acc2.c | 227 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local 246 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 249 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 411 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local 430 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 433 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 579 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local 596 vo2p1 = vmlaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 599 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
|
D | 5x5p2-minmax-neon-3x4.c | 227 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local 246 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 249 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 408 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local 427 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 430 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 573 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local 590 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 593 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
|
D | 5x5p2-minmax-neonfma-3x4-acc2.c | 227 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local 246 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 249 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 411 const float32x4_t vi5x6789 = vextq_f32(vi5x4567, vi5x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local 430 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 433 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 579 const float32x4_t vi5x6789 = vextq_f32(vi5x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local 596 vo2p1 = vfmaq_lane_f32(vo2p1, vi5x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 599 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-splat-5x4.c | 298 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local 317 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 322 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 327 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 332 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 552 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local 571 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 576 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 581 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() 586 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-5x4.c | 298 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local 317 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 322 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 327 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 332 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 552 const v128_t vi5x6789 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local 571 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 576 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 581 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() 586 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() [all …]
|
D | 5x5p2-minmax-sse-4x4.c | 296 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 311 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 314 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 317 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 510 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 525 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 528 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 531 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 704 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 719 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() [all …]
|
D | 5x5p2-minmax-sse-4x4-acc2.c | 296 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 311 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 314 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 317 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 514 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 529 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 532 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 535 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() 712 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local 727 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() [all …]
|
D | 5x5p2-minmax-sse-3x4-acc2.c | 258 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 272 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 274 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 436 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 450 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 452 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 597 const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 611 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 613 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
|