/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-sse-1x4-acc4.c | 169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() local 175 vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() 181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() 269 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() local 275 vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() 281 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() 358 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() local 364 vo0p3 = _mm_add_ps(vo0p3, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4() 370 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc4()
|
D | 5x5p2-minmax-sse-1x4.c | 169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() local 175 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() 181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() 266 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() local 272 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() 278 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() 352 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() local 358 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4() 364 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4()
|
D | 5x5p2-minmax-sse-1x4-acc2.c | 169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() local 175 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 267 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() local 273 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 279 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 354 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() local 360 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2() 366 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc2()
|
D | 5x5p2-minmax-sse-1x4-acc3.c | 169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local 175 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 268 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local 274 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 280 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 356 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() local 362 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3() 368 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc3()
|
D | 5x5p2-minmax-sse-1x4-acc5.c | 169 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() local 175 vo0p4 = _mm_add_ps(vo0p4, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() 181 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() 270 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() local 276 vo0p4 = _mm_add_ps(vo0p4, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() 282 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() 360 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() local 366 vo0p4 = _mm_add_ps(vo0p4, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5() 372 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_1x4_acc5()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | 185 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 193 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 292 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 300 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 304 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 308 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
|
D | 3x3p1-minmax-neon-5x4.c | 167 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local 175 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 180 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 185 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 283 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() local 291 vo4p0 = vmlaq_lane_f32(vo4p0, vi4x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 296 vo3p0 = vmlaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4() 301 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4()
|
D | 3x3p1-minmax-neonfma-5x4.c | 167 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local 175 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 180 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 185 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 283 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() local 291 vo4p0 = vfmaq_lane_f32(vo4p0, vi4x5678, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 296 vo3p0 = vfmaq_lane_f32(vo3p0, vi4x5678, vget_high_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4() 301 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x5678, vw89, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4()
|
D | 5x5p2-minmax-sse-2x4.c | 201 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() local 211 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 212 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 219 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 337 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() local 347 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 348 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 355 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 459 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() local 469 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() [all …]
|
D | 5x5p2-minmax-neonfma-1x4-acc2.c | 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 147 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 245 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 255 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 346 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() local 356 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2() 362 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc2()
|
D | 5x5p2-minmax-neonfma-1x4-acc3.c | 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 147 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 246 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 256 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 348 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() local 358 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3() 364 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4_acc3()
|
D | 5x5p2-minmax-neon-1x4.c | 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 147 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 244 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 254 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 344 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() local 354 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4() 360 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4()
|
D | 5x5p2-minmax-neonfma-1x4.c | 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 147 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 244 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 254 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 344 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() local 354 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4() 360 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_1x4()
|
D | 5x5p2-minmax-neon-1x4-acc3.c | 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 147 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 246 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 256 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 348 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() local 358 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3() 364 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc3()
|
D | 5x5p2-minmax-neon-1x4-acc2.c | 137 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 147 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 245 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 255 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 346 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() local 356 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2() 362 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4_acc2()
|
D | 5x5p2-minmax-sse-3x4-acc2.c | 233 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 245 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 247 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 249 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 257 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 411 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() local 423 vo2p1 = _mm_add_ps(vo2p1, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 425 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 427 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() 435 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2() [all …]
|
D | 5x5p2-minmax-sse-3x4.c | 233 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 245 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 247 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 249 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 257 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 408 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 420 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 422 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 424 vo0p0 = _mm_add_ps(vo0p0, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 432 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() [all …]
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c | 185 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vi4x89AB, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 193 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 197 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 201 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 292 const v128_t vi4x5678 = wasm_v32x4_shuffle(vi4x4567, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 300 vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 304 vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 308 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
|
D | 3x3p1-minmax-ssse3-5x4.c | 178 …const __m128 vi4x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi4x89AB), _mm_castps_si… in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local 186 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 190 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 194 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 287 … const __m128 vi4x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi4x4567), 4)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local 295 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 299 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 303 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
|
D | 3x3p1-minmax-sse-5x4.c | 234 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local 244 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 248 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 252 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 387 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local 397 vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi4x5678, vk02)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 401 vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi4x5678, vk12)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 405 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x5678, vk22)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
|
D | 5x5p2-minmax-neon-2x4-acc2.c | 165 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local 178 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 180 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 311 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local 324 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 326 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 448 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() local 461 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 463 vo0p1 = vmlaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2() 470 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4_acc2()
|
D | 5x5p2-minmax-neonfma-2x4.c | 165 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local 178 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 180 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 309 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local 322 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 324 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 444 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local 457 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 459 vo0p0 = vfmaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 466 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4()
|
D | 5x5p2-minmax-neon-2x4.c | 165 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local 178 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 180 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 309 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local 322 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 324 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 444 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local 457 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 459 vo0p0 = vmlaq_lane_f32(vo0p0, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 466 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4()
|
D | 5x5p2-minmax-neonfma-2x4-acc2.c | 165 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local 178 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 180 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 311 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vi4x89AB, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local 324 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 326 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 448 const float32x4_t vi4x5678 = vextq_f32(vi4x4567, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() local 461 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x5678, vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 463 vo0p1 = vfmaq_lane_f32(vo0p1, vi4x5678, vwOP, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2() 470 const float32x4_t vi4x6789 = vextq_f32(vi4x5678, vzero, 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4_acc2()
|
D | 5x5p2-minmax-sse-2x4-acc2.c | 201 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() local 211 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() 212 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() 219 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() 339 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() local 349 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() 350 vo0p1 = _mm_add_ps(vo0p1, _mm_mul_ps(vi4x5678, vk43)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() 357 const __m128 vi4x6789 = _mm_shuffle_ps(vi4x5678, vi4x89AB, _MM_SHUFFLE(1, 0, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() 463 const __m128 vi4x5678 = _mm_shuffle_ps(vi4x8567, vi4x8567, _MM_SHUFFLE(0, 3, 2, 1)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() local 473 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x5678, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4_acc2() [all …]
|