Home
last modified time | relevance | path

Searched refs:vo0p2 (Results 1 – 25 of 98) sorted by relevance

1234

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-neonfma-1x4-acc3.c88 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local
92 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
104 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
110 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
127 vo0p2 = vfmaq_lane_f32(vo0p2, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
139 vo0p2 = vfmaq_lane_f32(vo0p2, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
145 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
162 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
168 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
171 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
[all …]
D5x5p2-minmax-neon-1x4-acc3.c88 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local
92 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
104 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
110 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
127 vo0p2 = vmlaq_lane_f32(vo0p2, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
139 vo0p2 = vmlaq_lane_f32(vo0p2, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
145 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
162 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
168 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
171 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c117 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() local
121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
133 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
139 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
168 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
174 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
191 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
197 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
200 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c117 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() local
121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
133 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
139 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
168 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
174 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
191 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
197 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
200 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() local
95vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
107vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
113vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
130vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
142vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
148vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
165vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
171vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
174 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4-acc3.c91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() local
95vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
107vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
113vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
130vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
142vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
148vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
165vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
171vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
174 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c117 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() local
123 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
137 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
170 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
189 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
197 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
200 vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
201 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4()
226 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() local
[all …]
D5x5p2-minmax-neonfma-1x4-acc4.c88 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() local
94 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
108 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
127 vo0p2 = vfmaq_lane_f32(vo0p2, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
141 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
160 vo0p2 = vfmaq_lane_f32(vo0p2, vi0x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
168 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
171 vo0p2 = vaddq_f32(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
172 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4()
198 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() local
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local
97vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
111vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
130vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
144vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
163vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
171vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
174 vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
175 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4()
200 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c117 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() local
123 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
137 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
170 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
189 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
197 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
200 vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
201 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4()
226 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() local
[all …]
D5x5p2-minmax-neon-1x4-acc4.c88 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() local
94 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
108 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
127 vo0p2 = vmlaq_lane_f32(vo0p2, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
141 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
160 vo0p2 = vmlaq_lane_f32(vo0p2, vi0x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
168 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
171 vo0p2 = vaddq_f32(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
172 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4()
198 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() local
[all …]
D5x5s2p2-minmax-neon-1x4-acc3.c96 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() local
100 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
106 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
112 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
129 vo0p2 = vmlaq_lane_f32(vo0p2, vi2x68AC, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
152 vo0p2 = vmlaq_lane_f32(vo0p2, vi0x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
158 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
175 vo0p2 = vmlaq_lane_f32(vo0p2, vi1xACEG, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
181 vo0p2 = vmlaq_lane_f32(vo0p2, vi4xACEG, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
184 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3()
[all …]
D5x5s2p2-minmax-neonfma-1x4-acc3.c96 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() local
100 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
106 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
112 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
129 vo0p2 = vfmaq_lane_f32(vo0p2, vi2x68AC, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
152 vo0p2 = vfmaq_lane_f32(vo0p2, vi0x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
158 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
175 vo0p2 = vfmaq_lane_f32(vo0p2, vi1xACEG, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
181 vo0p2 = vfmaq_lane_f32(vo0p2, vi4xACEG, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
184 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3()
[all …]
D5x5p2-minmax-scalar-1x1-acc3.c113 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() local
123 vo0p2 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
126 vo0p2 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
136 vo0p2 += vi1x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
139 vo0p2 += vi4x2 * vk42; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
149 vo0p2 += vi2x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
159 vo0p2 += vi0x4 * vk04; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
162 vo0p2 += vi3x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
166 vo0p0 += vo0p2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3()
178 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() local
[all …]
D5x5s2p2-minmax-scalar-1x1-acc3.c125 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() local
135 vo0p2 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
138 vo0p2 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
148 vo0p2 += vi1x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
151 vo0p2 += vi4x2 * vk42; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
161 vo0p2 += vi2x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
165 vo0p2 += vi0x4 * vk04; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
168 vo0p2 += vi3x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
172 vo0p0 += vo0p2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3()
189 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() local
[all …]
D5x5s2p2-minmax-scalar-1x1-acc4.c125 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() local
136 vo0p2 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
147 vo0p2 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
151 vo0p2 += vi4x2 * vk42; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
162 vo0p2 += vi3x3 * vk33; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
167 vo0p2 += vi2x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
172 vo0p2 += vo0p3; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
173 vo0p0 += vo0p2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
190 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() local
195 vo0p2 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4()
[all …]
D5x5p2-minmax-scalar-1x1-acc4.c113 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() local
124 vo0p2 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
135 vo0p2 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
139 vo0p2 += vi4x2 * vk42; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
150 vo0p2 += vi3x3 * vk33; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
161 vo0p2 += vi2x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
166 vo0p2 += vo0p3; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
167 vo0p0 += vo0p2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
179 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() local
190 vo0p2 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-1x4-acc4.c91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() local
97vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4()
111vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4()
130vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4()
144vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4()
163vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4()
171vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4()
174 vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4()
175 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4()
200 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() local
[all …]
D5x5p2-minmax-sse-1x4-acc3.c104 __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local
131 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
134 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
160 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
163 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
173 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
183 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
186 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
190 vo0p0 = _mm_add_ps(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
203 __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local
[all …]
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c140 v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() local
144 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
150 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
173 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x68AC, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
217 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
223 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
245 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1xACEG, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
251 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4xACEG, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
254 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3()
[all …]
D5x5s2p2-minmax-neon-1x4-acc4.c96 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() local
102 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4()
110 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4()
129 vo0p2 = vmlaq_lane_f32(vo0p2, vi2x68AC, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4()
154 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4()
173 vo0p2 = vmlaq_lane_f32(vo0p2, vi0xACEG, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4()
181 vo0p2 = vmlaq_lane_f32(vo0p2, vi4xACEG, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4()
184 vo0p2 = vaddq_f32(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4()
185 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4()
213 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() local
[all …]
D5x5s2p2-minmax-neonfma-1x4-acc4.c96 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() local
102 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4()
110 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4()
129 vo0p2 = vfmaq_lane_f32(vo0p2, vi2x68AC, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4()
154 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4()
173 vo0p2 = vfmaq_lane_f32(vo0p2, vi0xACEG, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4()
181 vo0p2 = vfmaq_lane_f32(vo0p2, vi4xACEG, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4()
184 vo0p2 = vaddq_f32(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4()
185 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4()
213 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() local
[all …]
D5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc3.c120 v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() local
124vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
130vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
136vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
153vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
197vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
203vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
225vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
231vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
234 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc3.c120 v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() local
124vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
130vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
136vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
153vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
197vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
203vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
225vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
231vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
234 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c140 v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() local
144 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
150 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
173 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x68AC, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
217 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
223 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
245 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1xACEG, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
251 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4xACEG, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
254 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3()
[all …]

1234