/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-scalar-3x1.c | 47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 187 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 188 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 189 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 314 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 315 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 316 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 403 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 404 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 405 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 187 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 188 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 189 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 317 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 318 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 319 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 409 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 410 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 411 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 226 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 227 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 228 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 340 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 341 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 342 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 417 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 418 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 419 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 226 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 227 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 228 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 343 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 344 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 345 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 423 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 424 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 425 vo2p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 3x3p1-minmax-scalar-5x1.c | 43 const float vk22 = weights[9]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 165 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 166 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 167 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 168 vo3p0 += vi5x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 169 vo4p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
|
D | 5x5p2-minmax-scalar-2x1.c | 47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 162 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 163 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 257 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 258 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 322 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 323 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 3x3p1-minmax-scalar-6x1.c | 43 const float vk22 = weights[9]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local 184 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 185 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 186 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 187 vo3p0 += vi5x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 188 vo4p0 += vi6x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 189 vo5p0 += vi7x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | 51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 199 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 200 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 202 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 203 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 306 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 307 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 308 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 309 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | 51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 221 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 222 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 224 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 225 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 226 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 345 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 346 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 347 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | 51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 221 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 222 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 224 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 225 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 226 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi7x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 345 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 346 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 347 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-ssse3-6x4.c | 46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 214 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 215 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 216 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 217 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 218 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 219 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 340 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 341 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 342 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() [all …]
|
D | 5x5s2p2-minmax-scalar-2x1.c | 48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local 188 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 189 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 270 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 271 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 324 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 325 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
|
D | 5x5p2-minmax-scalar-2x1-acc3.c | 47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local 162 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 163 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 261 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 262 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 330 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 331 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
|
D | 5x5p2-minmax-scalar-2x1-acc2.c | 47 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() local 162 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 163 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 259 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 260 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 326 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2() 327 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2()
|
D | 5x5s2p2-minmax-scalar-2x1-acc2.c | 48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() local 188 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 189 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 272 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 273 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 328 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2() 329 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2()
|
D | 5x5s2p2-minmax-scalar-2x1-acc3.c | 48 const float vk22 = weights[13]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() local 188 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 189 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 274 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 275 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 332 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3() 333 vo1p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3()
|
D | 3x3p1-minmax-sse-6x4.c | 46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() local 280 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 281 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 282 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 283 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 284 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 285 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi7x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 456 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 457 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() 458 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c | 51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 199 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 200 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 202 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 203 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 306 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 307 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 308 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 309 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 3x3p1-minmax-ssse3-5x4.c | 46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local 192 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 193 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 194 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 195 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 196 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 301 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 302 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 303 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 304 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() [all …]
|
D | 3x3p1-minmax-sse-5x4.c | 46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local 250 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 251 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 252 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 253 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 254 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi6x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 403 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 404 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 405 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 406 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() [all …]
|
D | 3x3p1-minmax-scalar-4x1.c | 43 const float vk22 = weights[9]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 146 vo0p0 += vi2x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 147 vo1p0 += vi3x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 148 vo2p0 += vi4x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 149 vo3p0 += vi5x2 * vk22; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c | 51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 177 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 178 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 179 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 180 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 267 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 268 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 269 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 270 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
|
D | 3x3p1-minmax-ssse3-4x4.c | 46 const __m128 vk22 = _mm_load1_ps(weights + 9); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 170 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 171 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 172 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 173 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 262 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 263 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 264 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 265 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c | 51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 177 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 178 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 179 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 180 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 267 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 268 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 269 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 270 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-3x4.c | 51 const v128_t vk22 = wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() local 155 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 156 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 157 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 228 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi2x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 229 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 230 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
|