/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-scalar-6x1.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local 116 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 117 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 118 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 119 float vo3p0 = vbias + vi3x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 120 float vo4p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 121 float vo5p0 = vbias + vi5x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 216 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 217 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 218 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() [all …]
|
D | 3x3p1-minmax-scalar-5x1.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 107 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 108 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 109 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 110 float vo3p0 = vbias + vi3x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 111 float vo4p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 193 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 194 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 195 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 196 float vo3p0 = vbias + vi3x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() [all …]
|
D | 3x3p1-minmax-scalar-4x1.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 98 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 99 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 100 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 101 float vo3p0 = vbias + vi3x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 170 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 171 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 172 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 173 float vo3p0 = vbias + vi3x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3s2p1-minmax-scalar-4x1.c | 36 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 116 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 117 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 118 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 119 float vo3p0 = vbias + vi6x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 213 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 214 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 215 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 216 float vo3p0 = vbias + vi6x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1()
|
D | 3x3p1-minmax-scalar-3x1.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 89 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 90 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 91 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 147 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 148 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 149 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1()
|
D | 3x3s2p1-minmax-scalar-3x1.c | 36 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() local 102 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 103 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 104 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 179 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 180 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 181 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1()
|
D | 3x3p1-minmax-scalar-2x1.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() local 80 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 81 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 124 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 125 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1()
|
D | 3x3p1-minmax-scalar-2x1-acc2.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() local 80 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 81 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 126 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 127 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2()
|
D | 3x3s2p1-minmax-scalar-2x1.c | 36 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() local 88 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 89 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 145 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1() 146 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1()
|
D | 3x3s2p1-minmax-scalar-2x1-acc2.c | 36 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() local 88 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 89 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 147 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2() 148 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2()
|
D | 5x5p2-minmax-scalar-3x1.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 133 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 134 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 135 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 260 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 261 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 262 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 365 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 366 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 367 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 133 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 134 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 135 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 263 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 264 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 265 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 371 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 372 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 373 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 36 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 168 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 169 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 170 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 302 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 303 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 304 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 379 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 380 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 381 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 36 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 168 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 169 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 170 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 305 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 306 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 307 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 385 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 386 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 387 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 5x5p2-minmax-scalar-2x1.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 122 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 123 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 217 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 218 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 296 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 297 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | 43 const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 157 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 158 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 159 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 160 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 161 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 272 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 273 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 274 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 275 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | 43 const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 172 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 173 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 174 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 175 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 176 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 177 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 305 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 306 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 307 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | 43 const v128_t vk00 = wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 172 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 173 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 174 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 175 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 176 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 177 vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 305 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi0x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 306 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 307 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() [all …]
|
D | 3x3p1-minmax-ssse3-6x4.c | 38 const __m128 vk00 = _mm_load1_ps(weights + 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 165 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 166 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 167 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 168 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi3x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 169 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 170 vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 299 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi0x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 300 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 301 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk00)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() [all …]
|
D | 3x3p1-minmax-scalar-1x1-acc3.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() local 71 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 103 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3()
|
D | 3x3p1-minmax-scalar-1x1.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() local 71 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 101 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1()
|
D | 3x3p1-minmax-scalar-1x1-acc2.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() local 71 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 102 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2()
|
D | 3x3p1-minmax-scalar-1x1-acc4.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4() local 71 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4() 104 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4()
|
D | 5x5s2p2-minmax-scalar-2x1.c | 36 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local 146 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 147 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 244 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 245 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 298 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 299 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1()
|
D | 5x5p2-minmax-scalar-2x1-acc3.c | 35 const float vk00 = weights[1]; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() local 122 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 123 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 221 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 222 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 304 float vo0p0 = vbias + vi0x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3() 305 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3()
|