Home
last modified time | relevance | path

Searched refs:vi3x4567 (Results 1 – 25 of 178) sorted by relevance

12345678

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-neonfma-1x4-acc2.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local
92 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
200 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
207 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
[all …]
D5x5p2-minmax-neonfma-1x4-acc3.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
92 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
201 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
208 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
[all …]
D5x5p2-minmax-neon-1x4.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local
92 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
199 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
206 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
[all …]
D5x5p2-minmax-neonfma-1x4.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local
92 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
199 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
206 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
[all …]
D5x5p2-minmax-neon-1x4-acc3.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local
92 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
201 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
208 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
[all …]
D5x5p2-minmax-neon-1x4-acc2.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local
92 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
200 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
207 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4() local
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
227 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
234 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc2.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2() local
121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
228 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
235 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc2.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2() local
121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
228 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
235 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() local
121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
230 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
237 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() local
121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
229 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
236 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
[all …]
D5x5p2-minmax-neonfma-1x4-acc4.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() local
92 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
202 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
209 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local
95 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
204 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
211 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc5.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5() local
121 v128_t vo0p4 = wasm_f32x4_mul(vi3x4567, vk32); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
231 v128_t vo0p4 = wasm_f32x4_mul(vi3x4567, vk32); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
238 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc5()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() local
121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
229 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
236 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4.c76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4() local
95 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
201 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
208 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc2.c76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2() local
95 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
202 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
209 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc2()
[all …]
D5x5p2-minmax-neonfma-1x4-acc5.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5() local
92 float32x4_t vo0p4 = vmulq_lane_f32(vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
203 float32x4_t vo0p4 = vmulq_lane_f32(vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
210 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc5()
[all …]
D5x5p2-minmax-neon-1x4-acc5.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5() local
92 float32x4_t vo0p4 = vmulq_lane_f32(vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
203 float32x4_t vo0p4 = vmulq_lane_f32(vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
210 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc5()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() local
121 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
230 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
237 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4.c76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4() local
95 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
201 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
208 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4()
[all …]
D5x5p2-minmax-neon-1x4-acc4.c73 float32x4_t vi3x4567 = vld1q_f32(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() local
92 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
99 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
118 const float32x4_t vi3x2345 = vextq_f32(vi3x0123, vi3x4567, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
119 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
136 const float32x4_t vi3x5678 = vextq_f32(vi3x4567, vi3x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
156 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
202 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
209 const float32x4_t vi3x3456 = vextq_f32(vi3x0123, vi3x4567, 3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() local
95 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
203 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
210 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4-acc2.c76 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2() local
95 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
102 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
121 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
122 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
139 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
158 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
159 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
202 …vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
209 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc2()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4.c102 v128_t vi3x4567 = wasm_v128_load(i3); i3 += 4; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4() local
121 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
128 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
147 const v128_t vi3x2345 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
148 vi3x0123 = vi3x4567; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
165 const v128_t vi3x5678 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
184 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
185 vi3x4567 = vi3x89AB; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
227 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
234 const v128_t vi3x3456 = wasm_v32x4_shuffle(vi3x0123, vi3x4567, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4()
[all …]

12345678