/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-wasmsimd-arm-splat-6x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() local 138 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 139 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 140 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 141 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 142 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 143 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 207 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 208 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() 209 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-splat-6x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() local 138 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 139 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 140 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 141 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 142 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 143 …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 207 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 208 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() 209 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-splat-5x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() local 126 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 127 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 128 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 129 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 130 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 186 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 187 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 188 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 189 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-splat-5x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() local 126 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 127 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 128 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 129 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 130 …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 186 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 187 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 188 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 189 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-splat-4x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() local 114 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 115 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 116 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 117 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 165 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 166 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 167 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 168 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-splat-4x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() local 114 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 115 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 116 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 117 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 165 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 166 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 167 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 168 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-splat-3x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() local 102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 104 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 146 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 190 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-splat-3x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() local 102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 104 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 146 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 190 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() [all …]
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-4x4.c | 42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() local 169 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 170 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 171 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 172 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 219 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 220 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 221 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi8x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 294 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() [all …]
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-4x4.c | 42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() local 169 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 170 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 171 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 172 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 219 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 220 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 221 …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi8x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 294 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() [all …]
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-3x4.c | 42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() local 144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 146 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 183 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 184 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 185 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 244 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 245 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 246 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-splat-2x4-acc2.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() local 90 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() 91 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() 123 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() 124 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() 160 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() 161 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() 188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2() 189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4_acc2()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-2x4-acc2.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() local 90 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() 91 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() 123 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() 124 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() 160 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() 161 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() 188 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2() 189 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4_acc2()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-2x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() local 90 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() 91 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() 123 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() 124 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() 158 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() 159 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() 186 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4() 187 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_2x4()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-2x4.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() local 90 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() 91 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() 123 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() 124 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() 158 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() 159 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() 186 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4() 187 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_2x4()
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-3x4.c | 42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() local 144 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 146 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 183 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 184 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 185 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 244 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 245 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 246 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() [all …]
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-2x4-acc2.c | 42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() local 119 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() 120 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() 148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() 149 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() 196 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() 197 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() 220 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2() 221 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4_acc2()
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-2x4.c | 42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() local 119 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() 120 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() 148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() 149 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() 194 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() 195 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() 218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4() 219 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_2x4()
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-2x4.c | 42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() local 119 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() 120 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() 148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() 149 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() 194 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() 195 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() 218 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4() 219 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4()
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-2x4-acc2.c | 42 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() local 119 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() 120 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() 148 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() 149 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() 196 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() 197 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() 220 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2() 221 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_2x4_acc2()
|
D | 3x3p1-minmax-neon-6x4.c | 39 const float32x2_t vw89 = vld1_f32(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() local 135 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 136 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 137 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 138 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 139 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 140 vo5p0 = vmlaq_lane_f32(vo5p0, vi7x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 204 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 205 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() 206 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4() [all …]
|
D | 3x3p1-minmax-neonfma-6x4.c | 39 const float32x2_t vw89 = vld1_f32(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() local 135 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 136 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 137 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 138 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 139 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 140 vo5p0 = vfmaq_lane_f32(vo5p0, vi7x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 204 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 205 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() 206 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4() [all …]
|
D | 3x3p1-minmax-neon-5x4.c | 39 const float32x2_t vw89 = vld1_f32(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local 123 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 124 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 125 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 126 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 127 vo4p0 = vmlaq_lane_f32(vo4p0, vi6x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 183 vo0p0 = vmlaq_lane_f32(vo0p0, vi2x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 184 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 185 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 186 vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() [all …]
|
D | 3x3p1-minmax-neonfma-5x4.c | 39 const float32x2_t vw89 = vld1_f32(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local 123 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 124 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 125 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 126 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 127 vo4p0 = vfmaq_lane_f32(vo4p0, vi6x4567, vw89, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 183 vo0p0 = vfmaq_lane_f32(vo0p0, vi2x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 184 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 185 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 186 vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-splat-1x4-acc2.c | 40 const v128_t vw89 = wasm_v64x2_load_splat(weights + 8); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() local 78 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 102 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 129 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89, vw89, 0, 0, 0, 0))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2() 149 …vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_1x4_acc2()
|