/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neon-4x4.c |
  185  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
  205  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
  209  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
  213  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
  403  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
  423  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
  427  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
  431  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
  610  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
  627  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
  [all …]
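For orientation, a minimal standalone sketch of the idiom these NEON matches repeat (not the XNNPACK kernel itself; the toy input row and weight values are assumptions): vextq_f32 takes lanes 2..5 of two adjacent 4-pixel registers to form the shifted "x2345" window, and vmlaq_lane_f32 multiply-accumulates that window against one broadcast weight lane per output row.

    /* Sketch only: same intrinsics as the listing, toy data. Build for an ARM target. */
    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
      const float row5[8] = {0, 1, 2, 3, 4, 5, 6, 7};   /* one input row */
      const float w[4]    = {0.25f, 0.5f, 0.75f, 1.0f}; /* four of the 25 5x5 weights (assumed layout) */

      const float32x4_t vi5x0123 = vld1q_f32(row5);      /* pixels 0..3 */
      const float32x4_t vi5x4567 = vld1q_f32(row5 + 4);  /* pixels 4..7 */
      const float32x4_t vw       = vld1q_f32(w);

      /* Lanes 2..5 of the concatenation {vi5x0123, vi5x4567}: the "x2345" window. */
      const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);

      /* One multiply-accumulate step against a broadcast weight lane (here w[3]). */
      float32x4_t vo0p0 = vdupq_n_f32(0.0f);
      vo0p0 = vmlaq_lane_f32(vo0p0, vi5x2345, vget_high_f32(vw), 1);

      float out[4];
      vst1q_f32(out, vo0p0);
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* prints: 2 3 4 5 */
      return 0;
    }

The neonfma variants below differ only in using vfmaq_lane_f32 (fused multiply-add) in place of vmlaq_lane_f32.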
|
D | 5x5p2-minmax-neon-4x4-acc2.c |
  185  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
  205  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
  209  vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
  213  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
  407  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
  427  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
  431  vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
  435  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
  618  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local
  635  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2()
  [all …]
|
D | 5x5p2-minmax-neonfma-4x4.c |
  185  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
  205  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
  209  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
  213  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
  403  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
  423  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
  427  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
  431  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
  610  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
  627  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
  [all …]
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c |
  185  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
  205  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
  209  vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
  213  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
  407  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
  427  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
  431  vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
  435  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
  618  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local
  635  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2()
  [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c |
  235  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
  254  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
  259  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
  264  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
  269  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
  489  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
  508  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
  513  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
  518  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
  523  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
  [all …]
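The WASM SIMD loadsplat variants obtain the same "x2345" window from a lane shuffle over two registers and, as the matches show, pair wasm_f32x4_mul with wasm_f32x4_add rather than a fused multiply-add. A minimal sketch under the same toy-data assumptions (not the XNNPACK kernel; compile with Emscripten and -msimd128):

    #include <wasm_simd128.h>
    #include <stdio.h>

    int main(void) {
      const float row5[8] = {0, 1, 2, 3, 4, 5, 6, 7};

      const v128_t vi5x0123 = wasm_v128_load(row5);
      const v128_t vi5x4567 = wasm_v128_load(row5 + 4);
      const v128_t vk10     = wasm_f32x4_splat(0.5f);  /* one pre-broadcast 5x5 weight (assumed value) */

      /* Lanes 2..5 of the 8-lane concatenation {vi5x0123, vi5x4567}: {2,3,4,5}. */
      const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);

      /* Separate multiply and add, one step of the per-row accumulation. */
      v128_t vo4p0 = wasm_f32x4_splat(0.0f);
      vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10));

      printf("%g %g %g %g\n",
             wasm_f32x4_extract_lane(vo4p0, 0), wasm_f32x4_extract_lane(vo4p0, 1),
             wasm_f32x4_extract_lane(vo4p0, 2), wasm_f32x4_extract_lane(vo4p0, 3));
      /* prints: 1 1.5 2 2.5 */
      return 0;
    }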
|
D | 5x5p2-minmax-neonfma-5x4.c |
  206  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
  225  vo4p0 = vfmaq_lane_f32(vo4p0, vi5x2345, vget_high_f32(vw4567), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
  230  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
  235  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
  240  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
  461  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
  480  vo4p0 = vfmaq_lane_f32(vo4p0, vi5x2345, vget_high_f32(vw4567), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
  485  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
  490  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
  495  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
  [all …]
|
D | 5x5p2-minmax-neon-5x4.c |
  206  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
  225  vo4p0 = vmlaq_lane_f32(vo4p0, vi5x2345, vget_high_f32(vw4567), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
  230  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
  235  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
  240  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
  461  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
  480  vo4p0 = vmlaq_lane_f32(vo4p0, vi5x2345, vget_high_f32(vw4567), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
  485  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x2345, vget_high_f32(vw89AB), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
  490  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
  495  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
  [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c |
  235  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
  254  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
  259  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
  264  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
  269  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
  489  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4() local
  508  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, vk10));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
  513  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
  518  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
  523  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4()
  [all …]
|
D | 5x5p2-minmax-sse-5x4.c |
  239  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
  276  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x2345, vk10));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
  280  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
  284  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
  288  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
  492  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4() local
  529  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x2345, vk10));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
  533  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
  537  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
  541  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4()
  [all …]
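SSE has no two-register lane extract, so the SSE variants reach the "x2345" window through rotated helper vectors (vi5x3012, vi5x7456) and a second _mm_shuffle_ps. A minimal sketch follows; how the two helpers are built here is an illustrative assumption consistent with the shuffle shown in the matches, not necessarily the exact code in the kernel, and the data and weight are toy values.

    #include <xmmintrin.h>
    #include <stdio.h>

    int main(void) {
      const float row5[8] = {0, 1, 2, 3, 4, 5, 6, 7};

      const __m128 vi5x0123 = _mm_loadu_ps(row5);      /* {0,1,2,3} */
      const __m128 vi5x4567 = _mm_loadu_ps(row5 + 4);  /* {4,5,6,7} */

      /* Rotate each register by one lane: {3,0,1,2} and {7,4,5,6}. */
      const __m128 vi5x3012 = _mm_shuffle_ps(vi5x0123, vi5x0123, _MM_SHUFFLE(2, 1, 0, 3));
      const __m128 vi5x7456 = _mm_shuffle_ps(vi5x4567, vi5x4567, _MM_SHUFFLE(2, 1, 0, 3));

      /* Lane 3 and lane 0 of the first rotation, lanes 1 and 2 of the second:
         {2,3,4,5}, the same window NEON gets from vextq_f32(..., 2). */
      const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));

      /* One multiply-add step against a broadcast weight, as in the listing. */
      const __m128 vk10 = _mm_set1_ps(0.5f);
      const __m128 vo4p0 = _mm_add_ps(_mm_setzero_ps(), _mm_mul_ps(vi5x2345, vk10));

      float out[4];
      _mm_storeu_ps(out, vo4p0);
      printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* prints: 1 1.5 2 2.5 */
      return 0;
    }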
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c |
  214  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
  234  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
  238  vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
  242  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
  435  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
  455  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
  459  vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
  463  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
  645  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local
  662  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2()
  [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c |
  214  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
  234  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
  238  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
  242  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
  431  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
  451  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
  455  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
  459  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
  637  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
  654  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
  [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c |
  214  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
  234  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
  238  vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
  242  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
  435  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
  455  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
  459  vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
  463  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
  645  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local
  662  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2()
  [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c |
  214  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
  234  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
  238  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
  242  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
  431  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
  451  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
  455  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
  459  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
  637  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
  654  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
  [all …]
|
D | 5x5p2-minmax-neonfma-3x4.c |
  164  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local
  183  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
  186  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
  345  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local
  364  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
  367  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
  516  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local
  533  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
  536  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c |
  193  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
  212  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
  215  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
  373  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
  392  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
  395  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
  543  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
  560  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
  563  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c |
  193  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
  212  vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
  215  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
  376  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
  395  vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
  398  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
  549  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local
  566  vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
  569  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c |
  193  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
  212  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
  215  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
  373  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
  392  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
  395  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
  543  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
  560  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
  563  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
|
D | 5x5p2-minmax-neon-3x4-acc2.c |
  164  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local
  183  vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
  186  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
  348  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local
  367  vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
  370  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
  522  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local
  539  vo2p1 = vmlaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
  542  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2()
|
D | 5x5p2-minmax-neon-3x4.c |
  164  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local
  183  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
  186  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
  345  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local
  364  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
  367  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
  516  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local
  533  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
  536  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
|
D | 5x5p2-minmax-neonfma-3x4-acc2.c |
  164  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local
  183  vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
  186  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
  348  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local
  367  vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
  370  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
  522  const float32x4_t vi5x2345 = vextq_f32(vi5x0123, vi5x4567, 2);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local
  539  vo2p1 = vfmaq_lane_f32(vo2p1, vi5x2345, vget_low_f32(vwGHIJ), 0);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
  542  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x2345, vget_low_f32(vwKLMN), 1);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2()
|
D | 5x5p2-minmax-wasmsimd-x86-splat-5x4.c |
  209  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local
  228  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
  233  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
  238  …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
  243  …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
  463  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4() local
  482  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
  487  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
  492  …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
  497  …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_5x4()
  [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-5x4.c |
  209  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local
  228  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
  233  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
  238  …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
  243  …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
  463  const v128_t vi5x2345 = wasm_v32x4_shuffle(vi5x0123, vi5x4567, 2, 3, 4, 5);  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4() local
  482  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
  487  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
  492  …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
  497  …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1,…  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_5x4()
  [all …]
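Unlike the loadsplat variants above, the two splat variants keep the 5x5 weights packed in vectors (vw4567, vw89AB, ...) and broadcast one weight lane on the fly with a self-shuffle, as the truncated wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, ...) calls show. A minimal sketch of just that broadcast step (not the XNNPACK kernel; the weight values and packing are illustrative assumptions):

    #include <wasm_simd128.h>
    #include <stdio.h>

    int main(void) {
      const float w[8] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f};
      const v128_t vw4567 = wasm_v128_load(w + 4);  /* packed weights 4..7 */

      /* Broadcast lane 2 of vw4567 (weight index 6) to all four lanes. */
      const v128_t vk = wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2);

      printf("%g %g %g %g\n",
             wasm_f32x4_extract_lane(vk, 0), wasm_f32x4_extract_lane(vk, 1),
             wasm_f32x4_extract_lane(vk, 2), wasm_f32x4_extract_lane(vk, 3));
      /* prints: 0.7 0.7 0.7 0.7 */
      return 0;
    }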
|
D | 5x5p2-minmax-sse-4x4.c |
  216  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
  251  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
  254  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
  257  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
  430  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
  465  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
  468  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
  471  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
  634  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
  659  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
  [all …]
|
D | 5x5p2-minmax-sse-4x4-acc2.c |
  216  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
  251  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
  254  vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
  257  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
  434  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
  469  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
  472  vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
  475  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
  642  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2() local
  667  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x2345, vk20));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2()
  [all …]
|
D | 5x5p2-minmax-sse-3x4-acc2.c |
  193  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
  224  vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
  226  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
  371  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
  402  vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
  404  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
  540  const __m128 vi5x2345 = _mm_shuffle_ps(vi5x3012, vi5x7456, _MM_SHUFFLE(2, 1, 0, 3));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local
  563  vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi5x2345, vk30));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
  565  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x2345, vk40));  in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2()
|