/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-scalar-1x1-acc4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc4():
      102  float vi4x3 = *i4++;  (local)
      145  vi4x2 = vi4x3;
      151  vo0p3 += vi4x3 * vk43;
      157  vi4x3 = vi4x4;
      211  vi4x2 = vi4x3;
      217  vo0p3 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-1x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc2():
      102  float vi4x3 = *i4++;  (local)
      145  vi4x2 = vi4x3;
      151  vo0p1 += vi4x3 * vk43;
      157  vi4x3 = vi4x4;
      209  vi4x2 = vi4x3;
      215  vo0p1 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-1x1-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc3():
      102  float vi4x3 = *i4++;  (local)
      145  vi4x2 = vi4x3;
      151  vo0p1 += vi4x3 * vk43;
      157  vi4x3 = vi4x4;
      210  vi4x2 = vi4x3;
      216  vo0p1 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-1x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1():
      102  float vi4x3 = *i4++;  (local)
      145  vi4x2 = vi4x3;
      151  vo0p0 += vi4x3 * vk43;
      157  vi4x3 = vi4x4;
      208  vi4x2 = vi4x3;
      214  vo0p0 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-2x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1():
      111  float vi4x3 = *i4++;  (local)
      173  vi4x2 = vi4x3;
      183  vo1p0 += vi4x3 * vk33;
      184  vo0p0 += vi4x3 * vk43;
      191  vi4x3 = vi4x4;
      268  vi4x2 = vi4x3;
      278  vo1p0 += vi4x3 * vk33;
      279  vo0p0 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-3x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1():
      120  float vi4x3 = *i4++;  (local)
      201  vi4x2 = vi4x3;
      213  vo2p0 += vi4x3 * vk23;
      215  vo1p0 += vi4x3 * vk33;
      217  vo0p0 += vi4x3 * vk43;
      225  vi4x3 = vi4x4;
      328  vi4x2 = vi4x3;
      340  vo2p0 += vi4x3 * vk23;
      342  vo1p0 += vi4x3 * vk33;
      344  vo0p0 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-1x1-acc5.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5():
      102  float vi4x3 = *i4++;  (local)
      145  vi4x2 = vi4x3;
      151  vo0p4 += vi4x3 * vk43;
      157  vi4x3 = vi4x4;
      212  vi4x2 = vi4x3;
      218  vo0p4 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-2x1-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc3():
      111  float vi4x3 = *i4++;  (local)
      173  vi4x2 = vi4x3;
      183  vo1p0 += vi4x3 * vk33;
      184  vo0p1 += vi4x3 * vk43;
      191  vi4x3 = vi4x4;
      272  vi4x2 = vi4x3;
      282  vo1p0 += vi4x3 * vk33;
      283  vo0p1 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-2x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2():
      111  float vi4x3 = *i4++;  (local)
      173  vi4x2 = vi4x3;
      183  vo1p0 += vi4x3 * vk33;
      184  vo0p1 += vi4x3 * vk43;
      191  vi4x3 = vi4x4;
      270  vi4x2 = vi4x3;
      280  vo1p0 += vi4x3 * vk33;
      281  vo0p1 += vi4x3 * vk43;
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2():
      120  float vi4x3 = *i4++;  (local)
      201  vi4x2 = vi4x3;
      213  vo2p1 += vi4x3 * vk23;
      215  vo1p0 += vi4x3 * vk33;
      217  vo0p1 += vi4x3 * vk43;
      225  vi4x3 = vi4x4;
      331  vi4x2 = vi4x3;
      343  vo2p1 += vi4x3 * vk23;
      345  vo1p0 += vi4x3 * vk33;
      347  vo0p1 += vi4x3 * vk43;
|
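The 5x5p2 scalar entries above all follow the same sliding-window idiom: five scalars hold the current 5-wide input window for each input row, every tap is multiplied into an accumulator, and the window is shifted by one column per output, which is where `vi4x2 = vi4x3;`, `vo0p* += vi4x3 * vk43;`, and `vi4x3 = vi4x4;` come from. Below is a minimal sketch of that idiom for a single input row standing in for row 4; it is an illustrative simplification, not the XNNPACK kernel (the function and parameter names are invented, and bias, the other four kernel rows, and min/max clamping are omitted).

#include <stddef.h>

/* Simplified 1-D analogue of one row of the 5x5 (padding 2, stride 1) scalar
 * depthwise kernels listed above. Assumes width >= 2. */
static void dwconv5_row_sketch(
    size_t width,        /* number of output columns (== input width with pad 2) */
    const float* i4,     /* one input row (stands in for input row 4) */
    const float* k4,     /* five taps of kernel row 4: vk40..vk44 */
    float* o0)           /* partial sums contributed by this kernel row */
{
  const float vk40 = k4[0], vk41 = k4[1], vk42 = k4[2], vk43 = k4[3], vk44 = k4[4];

  /* Left padding of 2: the first two window slots start as implicit zeros. */
  float vi4x0 = 0.0f;
  float vi4x1 = 0.0f;
  float vi4x2 = *i4++;
  float vi4x3 = *i4++;   /* the "local" definition reported in the matches */

  for (size_t w = 0; w < width; w++) {
    /* Columns past the right edge are zero padding. */
    const float vi4x4 = (w + 2 < width) ? *i4++ : 0.0f;

    float vo0p0 = vi4x0 * vk40;
    vo0p0 += vi4x1 * vk41;
    vo0p0 += vi4x2 * vk42;
    vo0p0 += vi4x3 * vk43;   /* "vo0p* += vi4x3 * vk43;" in the matches */
    vo0p0 += vi4x4 * vk44;
    o0[w] = vo0p0;

    /* Rotate the window by one column. */
    vi4x0 = vi4x1;
    vi4x1 = vi4x2;
    vi4x2 = vi4x3;           /* "vi4x2 = vi4x3;" */
    vi4x3 = vi4x4;           /* "vi4x3 = vi4x4;" */
  }
}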
D | 5x5s2p2-minmax-scalar-1x1-acc4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc4():
      110  const float vi4x3 = i4[0];  (local)
      145  vi4x1 = vi4x3;
      163  vo0p3 += vi4x3 * vk43;
      186  const float vi4x3 = *i4++;  (local)
      210  vo0p3 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-1x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1():
      110  const float vi4x3 = i4[0];  (local)
      145  vi4x1 = vi4x3;
      163  vo0p0 += vi4x3 * vk43;
      183  const float vi4x3 = *i4++;  (local)
      207  vo0p0 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-1x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc2():
      110  const float vi4x3 = i4[0];  (local)
      145  vi4x1 = vi4x3;
      163  vo0p1 += vi4x3 * vk43;
      184  const float vi4x3 = *i4++;  (local)
      208  vo0p1 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-1x1-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc3():
      110  const float vi4x3 = i4[0];  (local)
      145  vi4x1 = vi4x3;
      163  vo0p1 += vi4x3 * vk43;
      185  const float vi4x3 = *i4++;  (local)
      209  vo0p1 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-3x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1():
      143  const float vi4x3 = i4[0];  (local)
      214  vi4x1 = vi4x3;
      248  vo2p0 += vi4x3 * vk03;
      253  vo1p0 += vi4x3 * vk23;
      258  vo0p0 += vi4x3 * vk43;
      296  const float vi4x3 = *i4++;  (local)
      352  vo2p0 += vi4x3 * vk03;
      357  vo1p0 += vi4x3 * vk23;
      362  vo0p0 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2():
      143  const float vi4x3 = i4[0];  (local)
      214  vi4x1 = vi4x3;
      248  vo2p1 += vi4x3 * vk03;
      253  vo1p1 += vi4x3 * vk23;
      258  vo0p1 += vi4x3 * vk43;
      299  const float vi4x3 = *i4++;  (local)
      355  vo2p1 += vi4x3 * vk03;
      360  vo1p1 += vi4x3 * vk23;
      365  vo0p1 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-2x1.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1():
      127  const float vi4x3 = i4[0];  (local)
      180  vi4x1 = vi4x3;
      208  vo1p0 += vi4x3 * vk23;
      211  vo0p0 += vi4x3 * vk43;
      240  const float vi4x3 = *i4++;  (local)
      282  vo1p0 += vi4x3 * vk23;
      285  vo0p0 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-2x1-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2():
      127  const float vi4x3 = i4[0];  (local)
      180  vi4x1 = vi4x3;
      208  vo1p1 += vi4x3 * vk23;
      211  vo0p1 += vi4x3 * vk43;
      242  const float vi4x3 = *i4++;  (local)
      284  vo1p1 += vi4x3 * vk23;
      287  vo0p1 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-2x1-acc3.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc3():
      127  const float vi4x3 = i4[0];  (local)
      180  vi4x1 = vi4x3;
      208  vo1p2 += vi4x3 * vk23;
      211  vo0p1 += vi4x3 * vk43;
      244  const float vi4x3 = *i4++;  (local)
      286  vo1p2 += vi4x3 * vk23;
      289  vo0p1 += vi4x3 * vk43;
|
D | 5x5s2p2-minmax-scalar-1x1-acc5.c | in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5():
      110  const float vi4x3 = i4[0];  (local)
      145  vi4x1 = vi4x3;
      163  vo0p4 += vi4x3 * vk43;
      187  const float vi4x3 = *i4++;  (local)
      211  vo0p4 += vi4x3 * vk43;
|
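The 5x5s2p2 entries use the same window but with stride 2: each output column consumes two fresh input values, so the window rotates by two (hence `vi4x1 = vi4x3;` rather than `vi4x2 = vi4x3;`), and `vi4x3` is defined twice because the main loop and the single-column tail load it differently. A minimal single-row sketch of the stride-2 rotation, again with invented names and with bias, the other kernel rows, and clamping left out:

#include <stddef.h>

/* Simplified 1-D analogue of one row of the 5x5 stride-2 (padding 2) scalar
 * depthwise kernels listed above. Assumes input_width >= 1. */
static void dwconv5_s2_row_sketch(
    size_t input_width,   /* input row length */
    const float* i4,      /* one input row (stands in for input row 4) */
    const float* k4,      /* five taps of kernel row 4: vk40..vk44 */
    float* o0)            /* output row of length (input_width + 1) / 2 */
{
  const float vk40 = k4[0], vk41 = k4[1], vk42 = k4[2], vk43 = k4[3], vk44 = k4[4];
  const size_t output_width = (input_width + 1) / 2;

  /* Left padding of 2: the window starts with two implicit zeros. */
  float vi4x0 = 0.0f;
  float vi4x1 = 0.0f;
  float vi4x2 = *i4++;
  size_t consumed = 1;

  for (size_t w = 0; w < output_width; w++) {
    /* Each output column pulls in two new input columns (zero past the edge). */
    float vi4x3 = 0.0f;
    float vi4x4 = 0.0f;
    if (consumed < input_width) { vi4x3 = *i4++; consumed++; }
    if (consumed < input_width) { vi4x4 = *i4++; consumed++; }

    float vo0p0 = vi4x0 * vk40;
    vo0p0 += vi4x1 * vk41;
    vo0p0 += vi4x2 * vk42;
    vo0p0 += vi4x3 * vk43;   /* "vo0p* += vi4x3 * vk43;" in the matches */
    vo0p0 += vi4x4 * vk44;
    o0[w] = vo0p0;

    /* Stride 2: rotate the window by two columns. */
    vi4x0 = vi4x2;
    vi4x1 = vi4x3;           /* "vi4x1 = vi4x3;" in the matches */
    vi4x2 = vi4x4;
  }
}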
/external/XNNPACK/src/f32-conv-hwc2chw/ |
D | 3x3s2p1c3x4-wasmsimd-2x2.c | in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2():
      236  const v128_t vi4x3 = wasm_v128_load(i4); i4 += 4;  (local)
      255  …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi4x3, vi4x3, 0, 0, 0, 0))…
      276  …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk22c0, wasm_v32x4_shuffle(vi4x3, vi4x3, 1, 1, 1, 1))…
      297  …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk22c1, wasm_v32x4_shuffle(vi4x3, vi4x3, 2, 2, 2, 2))…
      318  …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk22c2, wasm_v32x4_shuffle(vi4x3, vi4x3, 3, 3, 3, 3))…
      324  vi4x0 = vi4x3;
      508  v128_t vi4x3 = vzero;  (local)
      515  vi4x3 = wasm_v32x4_load_splat(i4 + 8);
      535  …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi4x3, vi4x3, 0, 0, 0, 0))…
|
D | 3x3s2p1c3x4-sse-2x2.c | in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2():
      235  const __m128 vi4x3 = _mm_loadu_ps(i4); i4 += 4;  (local)
      254  …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))…
      275  …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c0, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(1, 1, 1, 1))…
      296  …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c1, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(2, 2, 2, 2))…
      317  …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk22c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(3, 3, 3, 3))…
      323  vi4x0 = vi4x3;
      508  __m128 vi4x3 = _mm_setzero_ps();  (local)
      515  vi4x3 = _mm_load_ss(i4 + 8);
      535  …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(0, 0, 0, 0))…
|
D | 3x3s2p1c3x4-neonfma-2x2.c | in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2():
      235  const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;  (local)
      254  vo1x1 = vfmaq_laneq_f32(vo1x1, vk21c2, vi4x3, 0);
      275  vo1x1 = vfmaq_laneq_f32(vo1x1, vk22c0, vi4x3, 1);
      296  vo1x1 = vfmaq_laneq_f32(vo1x1, vk22c1, vi4x3, 2);
      317  vo1x1 = vfmaq_laneq_f32(vo1x1, vk22c2, vi4x3, 3);
      323  vi4x0 = vi4x3;
      508  float32x4_t vi4x3 = vmovq_n_f32(0.0f);  (local)
      515  vi4x3 = vld1q_lane_f32(i4 + 8, vi4x3, 0);
      535  vo1x1 = vfmaq_laneq_f32(vo1x1, vk21c2, vi4x3, 0);
|
D | 3x3s2p1c3x4-neon-2x2.c | in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2():
      235  const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;  (local)
      254  vo1x1 = vmlaq_lane_f32(vo1x1, vk21c2, vget_low_f32(vi4x3), 0);
      275  vo1x1 = vmlaq_lane_f32(vo1x1, vk22c0, vget_low_f32(vi4x3), 1);
      296  vo1x1 = vmlaq_lane_f32(vo1x1, vk22c1, vget_high_f32(vi4x3), 0);
      317  vo1x1 = vmlaq_lane_f32(vo1x1, vk22c2, vget_high_f32(vi4x3), 1);
      323  vi4x0 = vi4x3;
      506  float32x4_t vi4x3 = vmovq_n_f32(0.0f);  (local)
      513  vi4x3 = vld1q_lane_f32(i4 + 8, vi4x3, 0);
      533  vo1x1 = vmlaq_lane_f32(vo1x1, vk21c2, vget_low_f32(vi4x3), 0);
|
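The 3x3s2p1c3x4 HWC-to-CHW entries vectorize across four output channels rather than across pixels: one input scalar is broadcast to all four lanes and multiplied by a vector of four per-output-channel taps (the `wasm_v32x4_shuffle(vi4x3, vi4x3, n, n, n, n)` / `_mm_shuffle_ps(vi4x3, vi4x3, _MM_SHUFFLE(n, n, n, n))` pattern above). The SSE sketch below shows just that broadcast-and-accumulate step; the helper name and the tiny driver are invented for illustration and are not part of XNNPACK.

#include <stdio.h>
#include <xmmintrin.h>

/* Broadcast each lane of vi in turn and accumulate it against a 4-channel
 * weight vector, mirroring the shuffle/mul/add lines in the SSE entry above. */
static __m128 accumulate_four_taps(__m128 vo, __m128 vi, const __m128 vk[4])
{
  vo = _mm_add_ps(vo, _mm_mul_ps(vk[0], _mm_shuffle_ps(vi, vi, _MM_SHUFFLE(0, 0, 0, 0))));
  vo = _mm_add_ps(vo, _mm_mul_ps(vk[1], _mm_shuffle_ps(vi, vi, _MM_SHUFFLE(1, 1, 1, 1))));
  vo = _mm_add_ps(vo, _mm_mul_ps(vk[2], _mm_shuffle_ps(vi, vi, _MM_SHUFFLE(2, 2, 2, 2))));
  vo = _mm_add_ps(vo, _mm_mul_ps(vk[3], _mm_shuffle_ps(vi, vi, _MM_SHUFFLE(3, 3, 3, 3))));
  return vo;
}

int main(void)
{
  const __m128 vi = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);   /* four input scalars */
  const __m128 vk[4] = {
    _mm_set1_ps(0.1f), _mm_set1_ps(0.2f), _mm_set1_ps(0.3f), _mm_set1_ps(0.4f),
  };
  float out[4];
  _mm_storeu_ps(out, accumulate_four_taps(_mm_setzero_ps(), vi, vk));
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  /* 3.0 in every lane */
  return 0;
}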
/external/XNNPACK/src/f32-conv-hwc/gen/ |
D | 3x3s2p1c3x4-neon-2x2.c | in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2():
      237  const float32x4_t vi4x3 = vld1q_f32(i4); i4 += 4;  (local)
      259  vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk21c2x0123, vget_low_f32(vi4x3), 0);
      283  vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c0x0123, vget_low_f32(vi4x3), 1);
      307  vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c1x0123, vget_high_f32(vi4x3), 0);
      331  vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk22c2x0123, vget_high_f32(vi4x3), 1);
      337  vi4x0 = vi4x3;
|
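The NEON entries reach the same result without an explicit broadcast: `vmlaq_lane_f32` multiplies the four-channel weight vector by one lane of the input vector and accumulates in a single intrinsic, with the lane selected through `vget_low_f32`/`vget_high_f32`. A minimal sketch of that form follows; the helper and driver names are invented, it is not the XNNPACK kernel, and it only builds on an ARM target with <arm_neon.h>.

#include <stdio.h>
#include <arm_neon.h>

/* Same accumulation as the SSE sketch above, but the lane select and the
 * multiply-accumulate happen in one vmlaq_lane_f32, as in the NEON entries. */
static float32x4_t accumulate_four_taps_neon(
    float32x4_t vo, float32x4_t vi, const float32x4_t vk[4])
{
  vo = vmlaq_lane_f32(vo, vk[0], vget_low_f32(vi), 0);    /* vi lane 0 */
  vo = vmlaq_lane_f32(vo, vk[1], vget_low_f32(vi), 1);    /* vi lane 1 */
  vo = vmlaq_lane_f32(vo, vk[2], vget_high_f32(vi), 0);   /* vi lane 2 */
  vo = vmlaq_lane_f32(vo, vk[3], vget_high_f32(vi), 1);   /* vi lane 3 */
  return vo;
}

int main(void)
{
  const float in[4] = { 1.0f, 2.0f, 3.0f, 4.0f };          /* four input scalars */
  const float32x4_t vi = vld1q_f32(in);
  const float32x4_t vk[4] = {
    vdupq_n_f32(0.1f), vdupq_n_f32(0.2f), vdupq_n_f32(0.3f), vdupq_n_f32(0.4f),
  };
  float out[4];
  vst1q_f32(out, accumulate_four_taps_neon(vdupq_n_f32(0.0f), vi, vk));
  printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);  /* 3.0 in every lane */
  return 0;
}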