/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-scalar-6x1.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local 128 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 129 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 130 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 131 vo3p0 += vi5x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 132 vo4p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 133 vo5p0 += vi7x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 228 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 229 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 230 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() [all …]
|
D | 3x3p1-minmax-scalar-5x1.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 117 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 118 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 119 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 120 vo3p0 += vi5x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 121 vo4p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 203 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 204 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 205 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 206 vo3p0 += vi5x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() [all …]
|
D | 3x3p1-minmax-scalar-4x1.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 106 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 107 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 108 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 109 vo3p0 += vi5x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 178 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 179 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 180 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 181 vo3p0 += vi5x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3s2p1-minmax-scalar-4x1.c | 42 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 124 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 125 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 126 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 127 vo3p0 += vi8x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 221 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 222 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 223 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 224 vo3p0 += vi8x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1()
|
D | 3x3p1-minmax-scalar-3x1.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 95 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 96 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 97 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 153 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 154 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 155 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1()
|
D | 3x3s2p1-minmax-scalar-3x1.c | 42 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() local 108 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 109 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 110 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 185 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 186 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 187 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1()
|
D | 3x3p1-minmax-scalar-2x1.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() local 84 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 85 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 128 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 129 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1()
|
D | 3x3p1-minmax-scalar-2x1-acc2.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() local 84 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 85 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 130 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 131 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2()
|
D | 3x3s2p1-minmax-scalar-2x1.c | 42 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() local 92 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 93 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 149 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 150 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1()
|
D | 3x3s2p1-minmax-scalar-2x1-acc2.c | 42 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() local 92 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 93 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 151 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 152 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2()
|
D | 5x5p2-minmax-scalar-3x1.c | 45 const float vk20 = weights[11]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 139 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 140 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 141 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 266 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 267 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 268 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 371 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 372 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 373 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 45 const float vk20 = weights[11]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 139 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 140 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 141 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 269 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 270 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 271 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 377 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 378 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 379 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 46 const float vk20 = weights[11]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 174 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 175 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 176 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 308 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 309 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 310 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 385 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 386 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 387 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 46 const float vk20 = weights[11]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 174 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 175 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 176 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 311 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 312 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 313 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 391 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 392 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 393 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 5x5p2-minmax-scalar-2x1.c | 45 const float vk20 = weights[11]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 126 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 127 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 221 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 222 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 300 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 301 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | 49 const v128_t vk20 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 167 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 168 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 170 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 171 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 282 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 283 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 284 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 285 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | 49 const v128_t vk20 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 184 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 185 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 187 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 188 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 189 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | 49 const v128_t vk20 = wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 184 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 185 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 187 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 188 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 189 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 317 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 318 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 319 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-ssse3-6x4.c | 44 const __m128 vk20 = _mm_load1_ps(weights + 7); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 177 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 178 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 179 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 180 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 181 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 182 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 311 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 312 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 313 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk20)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() [all …]
|
D | 3x3p1-minmax-scalar-1x1-acc3.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() local 73 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 105 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3()
|
D | 3x3p1-minmax-scalar-1x1.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() local 73 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 103 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1()
|
D | 3x3p1-minmax-scalar-1x1-acc2.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() local 73 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 104 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2()
|
D | 3x3p1-minmax-scalar-1x1-acc4.c | 41 const float vk20 = weights[7]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4() local 73 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4() 106 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4()
|
D | 5x5s2p2-minmax-scalar-2x1.c | 46 const float vk20 = weights[11]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local 150 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 151 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 248 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 249 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 302 vo0p0 += vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 303 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
|
D | 5x5p2-minmax-scalar-2x1-acc3.c | 45 const float vk20 = weights[11]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local 126 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 127 float vo1p2 = vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 225 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 226 float vo1p2 = vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 308 float vo0p2 = vi2x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 309 float vo1p2 = vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
|