/external/XNNPACK/src/f32-dwconv2d-chw/gen/  (all references to the local vector variable vi6x89AB)
D | 3x3s2p1-minmax-wasmsimd-x86-loadsplat-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4():
    127  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    143  const v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
    144  const v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
    228  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    243  const v128_t vi6x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6));
    244  const v128_t vi6x9BDF = wasm_v128_and(vmask_odd,  wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7));
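Note: in these 3x3-stride-2 kernels the two shuffles deinterleave one block of the input row. Lane indices 0-3 of wasm_v32x4_shuffle address the first operand (vi6x89AB) and 4-7 the second (vi6xCDEF), so the pattern 0, 2, 4, 6 gathers the even columns 8, A, C, E and the pattern 1, 3, 5, 7 the odd columns 9, B, D, F; the vmask_even/vmask_odd variants in the second block appear to zero the lanes past the end of the row. A minimal scalar model of the wasm_v32x4_shuffle lane semantics, written as portable C purely for illustration (the helper name is made up, not from the source):

    #include <stdio.h>

    /* Scalar model of wasm_v32x4_shuffle(a, b, i0, i1, i2, i3):
     * lane indices 0..3 select from a, 4..7 select from b. */
    static void shuffle4(const float a[4], const float b[4],
                         const int idx[4], float out[4]) {
      for (int l = 0; l < 4; l++) {
        out[l] = idx[l] < 4 ? a[idx[l]] : b[idx[l] - 4];
      }
    }

    int main(void) {
      const float vi6x89AB[4] = {8.0f, 9.0f, 10.0f, 11.0f};   /* columns 8,9,A,B */
      const float vi6xCDEF[4] = {12.0f, 13.0f, 14.0f, 15.0f}; /* columns C,D,E,F */
      const int even[4] = {0, 2, 4, 6}, odd[4] = {1, 3, 5, 7};
      float vi6x8ACE[4], vi6x9BDF[4];
      shuffle4(vi6x89AB, vi6xCDEF, even, vi6x8ACE); /* -> 8, A, C, E */
      shuffle4(vi6x89AB, vi6xCDEF, odd,  vi6x9BDF); /* -> 9, B, D, F */
      printf("even: %g %g %g %g\n", vi6x8ACE[0], vi6x8ACE[1], vi6x8ACE[2], vi6x8ACE[3]);
      printf("odd:  %g %g %g %g\n", vi6x9BDF[0], vi6x9BDF[1], vi6x9BDF[2], vi6x9BDF[3]);
      return 0;
    }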
|
D | 3x3s2p1-minmax-wasmsimd-arm-loadsplat-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4():
    127  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    143  const v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
    144  const v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
    228  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    243  const v128_t vi6x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6));
    244  const v128_t vi6x9BDF = wasm_v128_and(vmask_odd,  wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7));
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4():
    117  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    133  const v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
    134  const v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
    218  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    233  const v128_t vi6x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6));
    234  const v128_t vi6x9BDF = wasm_v128_and(vmask_odd,  wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7));
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4():
    117  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    133  const v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
    134  const v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
    218  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    233  const v128_t vi6x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6));
    234  const v128_t vi6x9BDF = wasm_v128_and(vmask_odd,  wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7));
|
D | 3x3s2p1-minmax-sse-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_3x4():
    117  const __m128 vi6x89AB = _mm_loadu_ps(i6);  (local)
    133  const __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    134  const __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    221  const __m128 vi6x89AB = _mm_loadu_ps(i6);  (local)
    236  const __m128 vi6x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
    237  const __m128 vi6x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
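Note: the SSE files express the same even/odd split with _mm_shuffle_ps. _MM_SHUFFLE(2, 0, 2, 0) selects elements 0 and 2 of each operand (the two low result lanes from vi6x89AB, the two high ones from vi6xCDEF), and _MM_SHUFFLE(3, 1, 3, 1) selects elements 1 and 3. A self-contained sketch for an x86 target, with illustrative data rather than the kernel's pointers:

    #include <stdio.h>
    #include <xmmintrin.h>

    int main(void) {
      const float row[8] = {8, 9, 10, 11, 12, 13, 14, 15}; /* columns 8..F */
      const __m128 vi6x89AB = _mm_loadu_ps(row);
      const __m128 vi6xCDEF = _mm_loadu_ps(row + 4);
      /* _mm_shuffle_ps(x, y, _MM_SHUFFLE(d, c, b, a)) = { x[a], x[b], y[c], y[d] } */
      const __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
      const __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
      float even[4], odd[4];
      _mm_storeu_ps(even, vi6x8ACE);
      _mm_storeu_ps(odd, vi6x9BDF);
      printf("even: %g %g %g %g\n", even[0], even[1], even[2], even[3]); /* 8 10 12 14 */
      printf("odd:  %g %g %g %g\n", odd[0], odd[1], odd[2], odd[3]);     /* 9 11 13 15 */
      return 0;
    }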
|
D | 3x3s2p1-minmax-wasmsimd-arm-loadsplat-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4():
    140  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    162  const v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
    163  const v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
    268  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    287  const v128_t vi6x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6));
    288  const v128_t vi6x9BDF = wasm_v128_and(vmask_odd,  wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7));
|
D | 5x5p2-minmax-neonfma-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4():
    104  const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    195  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    229  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    230  vi6x4567 = vi6x89AB;
    277  float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    285  vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
    376  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    410  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    411  vi6x4567 = vi6x89AB;
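Note: in the 5x5-padding-2 kernels vi6x89AB is the look-ahead block of the row. vextq_f32(vi6x4567, vi6x89AB, n) concatenates the two registers and extracts four consecutive lanes starting at lane n, yielding the shifted windows 5678 and 6789 that the five-tap row convolution needs, after which vi6x4567 = vi6x89AB slides the window four columns to the right. The vreinterpret/vandq line at 285 is the NEON spelling of the remainder mask (compare the wasm_v128_and lines in the WASM SIMD entries below). A minimal sketch of the rotation, assuming an AArch64 target with NEON:

    #include <stdio.h>
    #include <arm_neon.h>

    int main(void) {
      const float row[8] = {4, 5, 6, 7, 8, 9, 10, 11};  /* columns 4..B */
      float32x4_t vi6x4567 = vld1q_f32(row);
      const float32x4_t vi6x89AB = vld1q_f32(row + 4);
      /* vextq_f32(a, b, n): lanes n..3 of a followed by lanes 0..n-1 of b */
      const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1); /* {5,6,7,8} */
      const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2); /* {6,7,8,9} */
      float out[4];
      vst1q_f32(out, vi6x5678);
      printf("x+1 window: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
      vst1q_f32(out, vi6x6789);
      printf("x+2 window: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
      vi6x4567 = vi6x89AB;  /* slide the window four columns right */
      (void)vi6x4567;
      return 0;
    }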
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4():
    133  const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    224  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    258  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    259  vi6x4567 = vi6x89AB;
    305  v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    313  vi6x89AB = wasm_v128_and(vmask, vi6x89AB);
    404  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    438  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    439  vi6x4567 = vi6x89AB;
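Note: the second block of references in each 5x5p2 kernel handles the remainder columns. wasm_v128_and(vmask, vi6x89AB) zeroes the lanes that lie past the end of the row, so the subsequent multiply-accumulates read zeros instead of out-of-bounds data. A scalar model of building and applying such a lane mask (the construction here is hypothetical; the real kernels load a precomputed mask from their parameters):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
      const size_t valid = 3;                   /* row has 3 remainder columns */
      const float vi6x89AB[4] = {8, 9, 10, 11};
      uint32_t vmask[4];
      float out[4];
      /* All-ones bits cover the first `valid` lanes, zeros cover the rest. */
      for (int l = 0; l < 4; l++) {
        vmask[l] = (size_t)l < valid ? UINT32_C(0xFFFFFFFF) : 0;
      }
      /* Bitwise AND on the float's bit pattern, as wasm_v128_and(vmask, v) does. */
      for (int l = 0; l < 4; l++) {
        uint32_t bits;
        memcpy(&bits, &vi6x89AB[l], sizeof bits);
        bits &= vmask[l];
        memcpy(&out[l], &bits, sizeof bits);
      }
      printf("masked: %g %g %g %g\n", out[0], out[1], out[2], out[3]); /* 8 9 10 0 */
      return 0;
    }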
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2():
    133  const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    224  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    258  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    259  vi6x4567 = vi6x89AB;
    308  v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    316  vi6x89AB = wasm_v128_and(vmask, vi6x89AB);
    407  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    441  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    442  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4():
    133  const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    224  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    258  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    259  vi6x4567 = vi6x89AB;
    305  v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    313  vi6x89AB = wasm_v128_and(vmask, vi6x89AB);
    404  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    438  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    439  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-neon-3x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2():
    104  const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    195  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    229  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    230  vi6x4567 = vi6x89AB;
    280  float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    288  vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
    379  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    413  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    414  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-neon-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4():
    104  const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    195  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    229  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    230  vi6x4567 = vi6x89AB;
    277  float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    285  vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
    376  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    410  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    411  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-neonfma-3x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2():
    104  const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    195  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    229  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    230  vi6x4567 = vi6x89AB;
    280  float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    288  vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
    379  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    413  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    414  vi6x4567 = vi6x89AB;
|
D | 3x3s2p1-minmax-wasmsimd-x86-loadsplat-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4():
    140  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    162  const v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
    163  const v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
    268  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    287  const v128_t vi6x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6));
    288  const v128_t vi6x9BDF = wasm_v128_and(vmask_odd,  wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7));
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4():
    130  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    152  const v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
    153  const v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
    258  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    277  const v128_t vi6x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6));
    278  const v128_t vi6x9BDF = wasm_v128_and(vmask_odd,  wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7));
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4():
    130  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    152  const v128_t vi6x8ACE = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6);
    153  const v128_t vi6x9BDF = wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7);
    258  const v128_t vi6x89AB = wasm_v128_load(i6);  (local)
    277  const v128_t vi6x8ACE = wasm_v128_and(vmask_even, wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 0, 2, 4, 6));
    278  const v128_t vi6x9BDF = wasm_v128_and(vmask_odd,  wasm_v32x4_shuffle(vi6x89AB, vi6xCDEF, 1, 3, 5, 7));
|
D | 3x3s2p1-minmax-sse-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_4x4():
    129  const __m128 vi6x89AB = _mm_loadu_ps(i6);  (local)
    151  const __m128 vi6x8ACE = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0));
    152  const __m128 vi6x9BDF = _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1));
    262  const __m128 vi6x89AB = _mm_loadu_ps(i6);  (local)
    281  const __m128 vi6x8ACE = _mm_and_ps(vmask_even, _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(2, 0, 2, 0)));
    282  const __m128 vi6x9BDF = _mm_and_ps(vmask_odd,  _mm_shuffle_ps(vi6x89AB, vi6xCDEF, _MM_SHUFFLE(3, 1, 3, 1)));
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4_acc2():
    133  const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    224  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    258  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    259  vi6x4567 = vi6x89AB;
    308  v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    316  vi6x89AB = wasm_v128_and(vmask, vi6x89AB);
    407  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    441  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    442  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-neon-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4():
    113  const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    223  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    263  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    264  vi6x4567 = vi6x89AB;
    322  float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    331  vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
    441  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    481  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    482  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4_acc2():
    107  const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    198  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    232  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    233  vi6x4567 = vi6x89AB;
    282  v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    290  vi6x89AB = wasm_v128_and(vmask, vi6x89AB);
    381  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    415  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    416  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4():
    107  const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    198  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    232  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    233  vi6x4567 = vi6x89AB;
    279  v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    287  vi6x89AB = wasm_v128_and(vmask, vi6x89AB);
    378  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    412  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    413  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-neon-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2():
    113  const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    223  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    263  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    264  vi6x4567 = vi6x89AB;
    326  float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    335  vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
    445  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    485  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    486  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4_acc2():
    107  const v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    198  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    232  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    233  vi6x4567 = vi6x89AB;
    282  v128_t vi6x89AB = wasm_v128_load(i6); i6 += 4;  (local)
    290  vi6x89AB = wasm_v128_and(vmask, vi6x89AB);
    381  const v128_t vi6x5678 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 1, 2, 3, 4);
    415  const v128_t vi6x6789 = wasm_v32x4_shuffle(vi6x4567, vi6x89AB, 2, 3, 4, 5);
    416  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-neonfma-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4():
    113  const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    223  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    263  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    264  vi6x4567 = vi6x89AB;
    322  float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    331  vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
    441  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    481  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    482  vi6x4567 = vi6x89AB;
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2():
    113  const float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    223  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    263  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    264  vi6x4567 = vi6x89AB;
    326  float32x4_t vi6x89AB = vld1q_f32(i6); i6 += 4;  (local)
    335  vi6x89AB = vreinterpretq_f32_u32(vandq_u32(vmask, vreinterpretq_u32_f32(vi6x89AB)));
    445  const float32x4_t vi6x5678 = vextq_f32(vi6x4567, vi6x89AB, 1);
    485  const float32x4_t vi6x6789 = vextq_f32(vi6x4567, vi6x89AB, 2);
    486  vi6x4567 = vi6x89AB;
|