/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neonfma-1x4-acc3.c | 88 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 92 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 104 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 110 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 127 vo0p2 = vfmaq_lane_f32(vo0p2, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 139 vo0p2 = vfmaq_lane_f32(vo0p2, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 145 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 162 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 168 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 171 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() [all …]
|
D | 5x5p2-minmax-neon-1x4-acc3.c | 88 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 92 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 104 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 110 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x3456, vget_high_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 127 vo0p2 = vmlaq_lane_f32(vo0p2, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 139 vo0p2 = vmlaq_lane_f32(vo0p2, vi0x5678, vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 145 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 162 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 168 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 171 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c | 117 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() local 121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 133 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 139 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 168 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 174 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 191 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 197 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() 200 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | 117 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() local 121 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 133 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 139 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 168 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x5678, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 174 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 191 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 197 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() 200 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc3.c | 91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() local 95 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 107 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 113 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 130 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 142 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 148 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 165 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 171 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() 174 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4-acc3.c | 91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() local 95 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 107 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 113 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 130 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 142 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 148 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 165 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 171 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() 174 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc3() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-1x4-acc4.c | 117 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() local 123 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 137 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 170 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 189 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 197 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 200 vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 201 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() 226 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_1x4_acc4() local [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc4.c | 88 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() local 94 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 108 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 127 vo0p2 = vfmaq_lane_f32(vo0p2, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 141 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 160 vo0p2 = vfmaq_lane_f32(vo0p2, vi0x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 168 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 171 vo0p2 = vaddq_f32(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 172 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() 198 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc4() local [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-1x4-acc4.c | 91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local 97 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 111 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 130 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 144 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 163 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 171 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 174 vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 175 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() 200 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_1x4_acc4() local [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-1x4-acc4.c | 117 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() local 123 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 137 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 170 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 189 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 197 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 200 vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 201 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() 226 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_1x4_acc4() local [all …]
|
D | 5x5p2-minmax-neon-1x4-acc4.c | 88 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() local 94 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 108 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 127 vo0p2 = vmlaq_lane_f32(vo0p2, vi2x2345, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 141 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x5678, vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 160 vo0p2 = vmlaq_lane_f32(vo0p2, vi0x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 168 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x6789, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 171 vo0p2 = vaddq_f32(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 172 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() 198 float32x4_t vo0p2 = vmulq_lane_f32(vi1x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc4() local [all …]
|
D | 5x5s2p2-minmax-neon-1x4-acc3.c | 96 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() local 100 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() 106 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() 112 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() 129 vo0p2 = vmlaq_lane_f32(vo0p2, vi2x68AC, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() 152 vo0p2 = vmlaq_lane_f32(vo0p2, vi0x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() 158 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() 175 vo0p2 = vmlaq_lane_f32(vo0p2, vi1xACEG, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() 181 vo0p2 = vmlaq_lane_f32(vo0p2, vi4xACEG, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() 184 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc3() [all …]
|
D | 5x5s2p2-minmax-neonfma-1x4-acc3.c | 96 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() local 100 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() 106 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() 112 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x8ACE9BDF.val[1], vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() 129 vo0p2 = vfmaq_lane_f32(vo0p2, vi2x68AC, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() 152 vo0p2 = vfmaq_lane_f32(vo0p2, vi0x79BD, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() 158 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x79BD, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() 175 vo0p2 = vfmaq_lane_f32(vo0p2, vi1xACEG, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() 181 vo0p2 = vfmaq_lane_f32(vo0p2, vi4xACEG, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() 184 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc3() [all …]
|
D | 5x5p2-minmax-scalar-1x1-acc3.c | 113 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() local 123 vo0p2 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 126 vo0p2 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 136 vo0p2 += vi1x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 139 vo0p2 += vi4x2 * vk42; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 149 vo0p2 += vi2x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 159 vo0p2 += vi0x4 * vk04; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 162 vo0p2 += vi3x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 166 vo0p0 += vo0p2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() 178 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3() local [all …]
|
D | 5x5s2p2-minmax-scalar-1x1-acc3.c | 125 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() local 135 vo0p2 += vi0x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 138 vo0p2 += vi3x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 148 vo0p2 += vi1x2 * vk12; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 151 vo0p2 += vi4x2 * vk42; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 161 vo0p2 += vi2x3 * vk23; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 165 vo0p2 += vi0x4 * vk04; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 168 vo0p2 += vi3x4 * vk34; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 172 vo0p0 += vo0p2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() 189 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3() local [all …]
|
D | 5x5s2p2-minmax-scalar-1x1-acc4.c | 125 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() local 136 vo0p2 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 147 vo0p2 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 151 vo0p2 += vi4x2 * vk42; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 162 vo0p2 += vi3x3 * vk33; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 167 vo0p2 += vi2x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 172 vo0p2 += vo0p3; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 173 vo0p0 += vo0p2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() 190 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() local 195 vo0p2 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4() [all …]
|
D | 5x5p2-minmax-scalar-1x1-acc4.c | 113 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() local 124 vo0p2 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 135 vo0p2 += vi0x2 * vk02; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 139 vo0p2 += vi4x2 * vk42; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 150 vo0p2 += vi3x3 * vk33; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 161 vo0p2 += vi2x4 * vk24; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 166 vo0p2 += vo0p3; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 167 vo0p0 += vo0p2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() 179 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() local 190 vo0p2 += vi1x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-1x4-acc4.c | 91 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() local 97 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() 111 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() 130 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() 144 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() 163 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() 171 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() 174 vo0p2 = wasm_f32x4_add(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() 175 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() 200 v128_t vo0p2 = wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_1x4_acc4() local [all …]
|
D | 5x5p2-minmax-sse-1x4-acc3.c | 104 __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local 131 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 134 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 160 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi1x2345, vk10)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 163 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi4x2345, vk40)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 173 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi2x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 183 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi0x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 186 vo0p2 = _mm_add_ps(vo0p2, _mm_mul_ps(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 190 vo0p0 = _mm_add_ps(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 203 __m128 vo0p2 = _mm_mul_ps(vi2x4567, vk22); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-1x4-acc3.c | 140 v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() local 144 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() 150 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() 156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() 173 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x68AC, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() 217 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() 223 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() 245 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1xACEG, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() 251 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4xACEG, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() 254 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_1x4_acc3() [all …]
|
D | 5x5s2p2-minmax-neon-1x4-acc4.c | 96 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() local 102 vo0p2 = vmlaq_lane_f32(vo0p2, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() 110 vo0p2 = vmlaq_lane_f32(vo0p2, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() 129 vo0p2 = vmlaq_lane_f32(vo0p2, vi2x68AC, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() 154 vo0p2 = vmlaq_lane_f32(vo0p2, vi1x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() 173 vo0p2 = vmlaq_lane_f32(vo0p2, vi0xACEG, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() 181 vo0p2 = vmlaq_lane_f32(vo0p2, vi4xACEG, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() 184 vo0p2 = vaddq_f32(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() 185 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() 213 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4_acc4() local [all …]
|
D | 5x5s2p2-minmax-neonfma-1x4-acc4.c | 96 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() local 102 vo0p2 = vfmaq_lane_f32(vo0p2, vi4x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() 110 vo0p2 = vfmaq_lane_f32(vo0p2, vi3x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() 129 vo0p2 = vfmaq_lane_f32(vo0p2, vi2x68AC, vget_high_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() 154 vo0p2 = vfmaq_lane_f32(vo0p2, vi1x79BD, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() 173 vo0p2 = vfmaq_lane_f32(vo0p2, vi0xACEG, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() 181 vo0p2 = vfmaq_lane_f32(vo0p2, vi4xACEG, vwOP, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() 184 vo0p2 = vaddq_f32(vo0p2, vo0p3); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() 185 vo0p0 = vaddq_f32(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() 213 float32x4_t vo0p2 = vmulq_lane_f32(vi1x8ACE, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc4() local [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-1x4-acc3.c | 120 v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() local 124 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() 130 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() 136 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() 153 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() 197 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() 203 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() 225 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() 231 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() 234 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc3() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-1x4-acc3.c | 120 v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() local 124 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() 130 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() 136 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() 153 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x68AC, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() 197 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x79BD, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() 203 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x79BD, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() 225 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1xACEG, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() 231 …vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4xACEG, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() 234 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc3() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-1x4-acc3.c | 140 v128_t vo0p2 = wasm_f32x4_mul(vi1x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() local 144 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() 150 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() 156 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4x9BDF, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() 173 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi2x68AC, vk20)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() 217 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi0x79BD, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() 223 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi3x79BD, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() 245 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi1xACEG, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() 251 vo0p2 = wasm_f32x4_add(vo0p2, wasm_f32x4_mul(vi4xACEG, vk44)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() 254 vo0p0 = wasm_f32x4_add(vo0p0, vo0p2); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_1x4_acc3() [all …]
|