/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neonfma-1x4-acc2.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 92 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 200 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 207 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc3.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 92 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 201 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 208 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() [all …]
|
D | 5x5p2-minmax-neon-1x4.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 92 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 199 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 206 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() [all …]
|
D | 5x5p2-minmax-neonfma-1x4.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 92 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 199 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 206 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc3.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 92 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 201 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 208 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc2.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 92 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 200 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 207 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() local 121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 227 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() 234 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() local 121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 228 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() 235 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() local 121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 228 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() 235 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() local 121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 230 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 237 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() local 121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 229 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 236 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc4.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() local 92 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 202 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 209 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c | 76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local 95 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 204 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 211 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc5.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() local 121 v128_t vo0p4 = wasm_f32x4_mul(vi3x4567, vk32); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 231 v128_t vo0p4 = wasm_f32x4_mul(vi3x4567, vk32); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() 238 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() local 121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 229 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 236 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4.c | 76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() local 95 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 201 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() 208 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c | 76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() local 95 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 202 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() 209 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc5.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() local 92 float32x4_t vo0p4 = vmulq_lane_f32(vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 203 float32x4_t vo0p4 = vmulq_lane_f32(vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() 210 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc5.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() local 92 float32x4_t vo0p4 = vmulq_lane_f32(vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 203 float32x4_t vo0p4 = vmulq_lane_f32(vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() 210 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() local 121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 230 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 237 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4.c | 76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() local 95 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 201 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() 208 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc4.c | 73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() local 92 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 202 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 209 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c | 76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() local 95 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 203 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 210 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4-acc2.c | 76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() local 95 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 202 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() 209 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4.c | 102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() local 121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 227 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() 234 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() [all …]
|