/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neon-4x4.c | 257 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 271 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 275 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 279 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 283 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 475 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 489 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 493 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 497 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 501 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() [all …]
|
D | 5x5p2-minmax-neon-4x4-acc2.c | 257 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 271 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 275 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 279 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 283 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 479 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() local 493 vo3p0 = vmlaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 497 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 501 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() 505 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-4x4.c | 257 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 271 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 275 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 279 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 283 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 475 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 489 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 493 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 497 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 501 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() [all …]
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | 257 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 271 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 275 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 279 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 283 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 479 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() local 493 vo3p0 = vfmaq_lane_f32(vo3p0, vi3x6789, vget_low_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 497 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 501 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() 505 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc2.c | 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 166 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 263 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 274 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 361 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 370 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
|
D | 5x5p2-minmax-neonfma-1x4-acc3.c | 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 166 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 264 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 275 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 363 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 372 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
|
D | 5x5p2-minmax-neon-1x4.c | 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 166 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 262 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 273 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 359 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 368 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
|
D | 5x5p2-minmax-neonfma-1x4.c | 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 166 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 262 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 273 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 359 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 368 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
|
D | 5x5p2-minmax-neon-1x4-acc3.c | 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 166 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 264 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 275 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 363 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 372 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
|
D | 5x5p2-minmax-neon-1x4-acc2.c | 155 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 166 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 263 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 274 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 361 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 370 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
|
D | 5x5p2-minmax-neon-2x4-acc2.c | 189 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local 203 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 205 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 335 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local 349 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 351 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 469 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local 480 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 482 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
|
D | 5x5p2-minmax-neonfma-2x4.c | 189 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local 203 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 205 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 333 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local 347 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 349 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 465 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local 476 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 478 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
|
D | 5x5p2-minmax-neon-2x4.c | 189 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local 203 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 205 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 333 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local 347 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 349 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 465 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local 476 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 478 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
|
D | 5x5p2-minmax-neonfma-2x4-acc2.c | 189 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local 203 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 205 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 335 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local 349 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 351 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 469 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local 480 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 482 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
|
D | 5x5p2-minmax-neonfma-3x4.c | 223 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local 238 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 241 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 244 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 404 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local 419 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 422 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 425 vo0p0 = vfmaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 571 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local 582 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 252 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 267 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 270 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 273 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 432 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 447 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 450 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 453 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 597 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 608 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | 252 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 267 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 270 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 273 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 435 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 450 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 453 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 456 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() 603 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() local 614 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 252 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 267 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 270 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 273 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 432 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 447 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 450 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 453 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 597 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x5678, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 608 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() [all …]
|
D | 5x5p2-minmax-neon-3x4-acc2.c | 223 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local 238 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 241 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 244 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 407 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local 422 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 425 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 428 vo0p1 = vmlaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() 577 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() local 588 vo2p1 = vmlaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4_acc2() [all …]
|
D | 5x5p2-minmax-neon-3x4.c | 223 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local 238 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 241 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 244 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 404 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local 419 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 422 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 425 vo0p0 = vmlaq_lane_f32(vo0p0, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 571 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local 582 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() [all …]
|
D | 5x5p2-minmax-neonfma-3x4-acc2.c | 223 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local 238 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 241 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 244 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 407 const float32x4_t vi3x6789 = vextq_f32(vi3x4567, vi3x89AB, 2); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local 422 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 425 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x6789, vget_high_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 428 vo0p1 = vfmaq_lane_f32(vo0p1, vi3x6789, vget_low_f32(vwKLMN), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() 577 const float32x4_t vi3x6789 = vextq_f32(vi3x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() local 588 vo2p1 = vfmaq_lane_f32(vo2p1, vi3x6789, vget_high_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4-acc2.c | 286 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 300 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 304 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 308 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 312 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 507 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() local 521 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 525 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 529 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() 533 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | 286 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 300 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 304 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 308 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 312 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 503 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 517 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 521 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 525 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 529 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4-acc2.c | 286 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 300 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 304 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 308 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 312 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 507 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() local 521 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 525 vo2p1 = wasm_f32x4_add(vo2p1, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 529 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() 533 vo0p1 = wasm_f32x4_add(vo0p1, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4_acc2() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | 286 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 300 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 304 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 308 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 312 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 503 const v128_t vi3x6789 = wasm_v32x4_shuffle(vi3x4567, vi3x89AB, 2, 3, 4, 5); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 517 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi3x6789, vk04)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 521 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x6789, vk14)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 525 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x6789, vk24)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 529 vo0p0 = wasm_f32x4_add(vo0p0, wasm_f32x4_mul(vi3x6789, vk34)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() [all …]
|