/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4():
    205  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    214  vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, vk02));
    219  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk12));
    224  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22));
    329  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4);  [local]
    338  vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, vk02));
    343  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk12));
    348  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22));
|
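Note: vi5x5678 is the four-lane window of input row 5 covering columns 5..8. With 1 pixel of padding, outputs 4..7 need input columns 5..8 for the right-hand kernel taps (vk02, vk12, vk22 above), so every kernel splices the next vector (vi5x89AB) onto the current one (vi5x4567) and keeps lanes 1..4: wasm_v32x4_shuffle(..., 1, 2, 3, 4) on WASM SIMD, vextq_f32(..., 1) on NEON. In the right-edge remainder path the next vector is replaced by vzero, which is why each file defines vi5x5678 twice. A minimal scalar sketch of the same windowing (illustrative only, not part of XNNPACK):

    #include <stdio.h>

    /* Scalar model of the shuffles in the listing: the concatenation of
       vi5x4567 and vi5x89AB shifted left by one lane. */
    int main(void) {
      const float vi5x4567[4] = {4.0f, 5.0f, 6.0f, 7.0f};   /* row 5, columns 4..7  */
      const float vi5x89AB[4] = {8.0f, 9.0f, 10.0f, 11.0f}; /* row 5, columns 8..11 */
      float vi5x5678[4];
      for (int i = 0; i < 4; i++) {
        /* lanes 0..2 come from vi5x4567 shifted by one; lane 3 is column 8 */
        vi5x5678[i] = (i < 3) ? vi5x4567[i + 1] : vi5x89AB[0];
      }
      printf("vi5x5678 = %g %g %g %g\n",
             vi5x5678[0], vi5x5678[1], vi5x5678[2], vi5x5678[3]);
      return 0;
    }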
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4():
    205  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    214  vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, vk02));
    219  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk12));
    224  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22));
    329  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4);  [local]
    338  vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, vk02));
    343  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk12));
    348  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22));
|
D | 3x3p1-minmax-neon-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_6x4():
    186  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    195  vo5p0 = vmlaq_lane_f32(vo5p0, vi5x5678, vget_high_f32(vw0123), 1);
    201  vo4p0 = vmlaq_lane_f32(vo4p0, vi5x5678, vget_high_f32(vw4567), 0);
    207  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1);
    319  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);  [local]
    328  vo5p0 = vmlaq_lane_f32(vo5p0, vi5x5678, vget_high_f32(vw0123), 1);
    334  vo4p0 = vmlaq_lane_f32(vo4p0, vi5x5678, vget_high_f32(vw4567), 0);
    340  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1);
|
D | 3x3p1-minmax-ssse3-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4():
    198  …const __m128 vi5x5678 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(vi5x89AB), _mm_castps_si…  [local]
    207  vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x5678, vk02));
    212  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk12));
    217  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22));
    324  … const __m128 vi5x5678 = _mm_castsi128_ps(_mm_alignr_epi8(vzero, _mm_castps_si128(vi5x4567), 4));  [local]
    333  vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x5678, vk02));
    338  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk12));
    343  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22));
|
D | 3x3p1-minmax-neonfma-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_6x4():
    186  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    195  vo5p0 = vfmaq_lane_f32(vo5p0, vi5x5678, vget_high_f32(vw0123), 1);
    201  vo4p0 = vfmaq_lane_f32(vo4p0, vi5x5678, vget_high_f32(vw4567), 0);
    207  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1);
    319  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);  [local]
    328  vo5p0 = vfmaq_lane_f32(vo5p0, vi5x5678, vget_high_f32(vw0123), 1);
    334  vo4p0 = vfmaq_lane_f32(vo4p0, vi5x5678, vget_high_f32(vw4567), 0);
    340  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1);
|
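Note: the neon and neonfma entries differ only in the accumulation intrinsic: vmlaq_lane_f32 (multiply-accumulate with the product rounded before the add) versus vfmaq_lane_f32 (fused multiply-add, single rounding). A minimal sketch of the two forms of one row-5 tap (assumes a target with NEON FMA, e.g. AArch64; not copied from the files above):

    #include <arm_neon.h>

    /* vo += vi5x5678 * vw0123[3], written both ways. */
    static float32x4_t acc_mla(float32x4_t vo, float32x4_t vi5x5678, float32x4_t vw0123) {
      return vmlaq_lane_f32(vo, vi5x5678, vget_high_f32(vw0123), 1);  /* product rounded, then added */
    }
    static float32x4_t acc_fma(float32x4_t vo, float32x4_t vi5x5678, float32x4_t vw0123) {
      return vfmaq_lane_f32(vo, vi5x5678, vget_high_f32(vw0123), 1);  /* fused, one rounding */
    }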
D | 3x3p1-minmax-sse-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_6x4():
    262  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    273  vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x5678, vk02));
    278  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk12));
    283  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22));
    438  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    449  vo5p0 = _mm_add_ps(vo5p0, _mm_mul_ps(vi5x5678, vk02));
    454  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk12));
    459  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk22));
|
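Note: plain SSE has no two-register lane extract, so the sse kernels build the shifted window in two steps: splice column 8 into lane 0 of the 4..7 vector (vi5x8567), then rotate the lanes down by one with _MM_SHUFFLE(0, 3, 2, 1), which is the pattern visible above. Only the rotation appears in this listing; building vi5x8567 with _mm_move_ss is an assumption based on the surrounding generated code. A sketch:

    #include <xmmintrin.h>

    /* [5 6 7 8] from [4 5 6 7] and [8 9 10 11] without SSSE3's palignr. */
    static __m128 make_vi5x5678(__m128 vi5x4567, __m128 vi5x89AB) {
      const __m128 vi5x8567 = _mm_move_ss(vi5x4567, vi5x89AB);            /* [8 5 6 7]: lane 0 <- column 8 */
      return _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1)); /* rotate down -> [5 6 7 8]      */
    }

The ssse3 variant in the entry further up gets the same window in a single _mm_alignr_epi8 on the integer-cast vectors.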
D | 5x5p2-minmax-sse-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_5x4():
    298  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    312  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk13));
    316  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
    320  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    324  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
    334  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    551  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    565  vo4p0 = _mm_add_ps(vo4p0, _mm_mul_ps(vi5x5678, vk13));
    569  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
    573  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    [all …]
|
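Note: in the 5x5 (pad 2) kernels, vi5x5678 feeds the fourth kernel column (vk13, vk23, vk33, vk43 above) of several output rows and is then reused to build the next window, vi5x6789, with one more shuffle, as at line 334 of the entry above. A sketch of that second shuffle (names follow the listing; illustrative only):

    #include <xmmintrin.h>

    /* [6 7 8 9] for the fifth kernel column, built from the already-shifted
       vi5x5678 = [5 6 7 8] and vi5x89AB = [8 9 10 11]. */
    static __m128 make_vi5x6789(__m128 vi5x5678, __m128 vi5x89AB) {
      /* lanes: result[0] = vi5x5678[1] = 6, result[1] = vi5x5678[2] = 7,
                result[2] = vi5x89AB[0] = 8, result[3] = vi5x89AB[1] = 9 */
      return _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    }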
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4():
    186  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    198  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk12));
    202  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22));
    293  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4);  [local]
    305  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk12));
    309  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk22));
|
D | 3x3p1-minmax-neon-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_5x4():
    168  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    181  vo4p0 = vmlaq_lane_f32(vo4p0, vi5x5678, vget_high_f32(vw4567), 0);
    186  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1);
    284  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);  [local]
    297  vo4p0 = vmlaq_lane_f32(vo4p0, vi5x5678, vget_high_f32(vw4567), 0);
    302  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vw89, 1);
|
D | 3x3p1-minmax-neonfma-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_5x4():
    168  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    181  vo4p0 = vfmaq_lane_f32(vo4p0, vi5x5678, vget_high_f32(vw4567), 0);
    186  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1);
    284  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);  [local]
    297  vo4p0 = vfmaq_lane_f32(vo4p0, vi5x5678, vget_high_f32(vw4567), 0);
    302  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vw89, 1);
|
D | 5x5p2-minmax-sse-3x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4_acc2():
    234  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    248  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    250  vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
    258  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    412  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    426  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    428  vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
    436  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    573  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    587  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    [all …]
|
D | 5x5p2-minmax-sse-3x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4():
    234  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    248  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    250  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
    258  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    409  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    423  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    425  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
    433  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    567  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    581  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    [all …]
|
D | 5x5p2-minmax-neon-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4():
    222  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    239  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    243  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    247  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x5678, vwOP, 0);
    440  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    457  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    461  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    465  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x5678, vwOP, 0);
    645  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);  [local]
    662  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    [all …]
|
D | 5x5p2-minmax-neon-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4_acc2():
    222  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    239  vo3p1 = vmlaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
    243  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    247  vo1p1 = vmlaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
    444  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    461  vo3p1 = vmlaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
    465  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    469  vo1p1 = vmlaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
    653  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);  [local]
    670  vo3p1 = vmlaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
    [all …]
|
D | 5x5p2-minmax-neonfma-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4():
    222  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    239  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    243  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    247  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x5678, vwOP, 0);
    440  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    457  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    461  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    465  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x5678, vwOP, 0);
    645  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);  [local]
    662  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    [all …]
|
D | 5x5p2-minmax-neonfma-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4_acc2():
    222  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    239  vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
    243  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    247  vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
    444  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    461  vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
    465  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    469  vo1p1 = vfmaq_lane_f32(vo1p1, vi5x5678, vwOP, 0);
    653  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vzero, 1);  [local]
    670  vo3p1 = vfmaq_lane_f32(vo3p1, vi5x5678, vget_high_f32(vwCDEF), 0);
    [all …]
|
D | 5x5p2-minmax-sse-4x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4():
    266  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    281  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
    284  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    287  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
    296  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    480  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    495  vo3p0 = _mm_add_ps(vo3p0, _mm_mul_ps(vi5x5678, vk23));
    498  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    501  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
    510  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    [all …]
|
D | 5x5p2-minmax-sse-4x4-acc2.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4_acc2():
    266  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    281  vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x5678, vk23));
    284  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    287  vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
    296  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    484  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    499  vo3p1 = _mm_add_ps(vo3p1, _mm_mul_ps(vi5x5678, vk23));
    502  vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x5678, vk33));
    505  vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x5678, vk43));
    514  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    [all …]
|
D | 3x3p1-minmax-wasmsimd-arm-splat-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_6x4():
    189  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    198  …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    204  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    210  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))…
    320  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4);  [local]
    329  …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    335  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    341  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))…
|
D | 3x3p1-minmax-wasmsimd-x86-splat-6x4.c | in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_6x4():
    189  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    198  …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    204  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    210  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))…
    320  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vzero, 1, 2, 3, 4);  [local]
    329  …vo5p0 = wasm_f32x4_add(vo5p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,…
    335  …vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2,…
    341  …vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, wasm_v32x4_shuffle(vw89, vw89, 1, 1, 1, 1))…
|
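Note: the loadsplat and splat WASM SIMD variants differ only in how a kernel tap reaches the multiplier. loadsplat broadcasts each tap into its own register up front (vk02, vk12, ... in the earlier entries); splat keeps the packed weight vectors (vw0123, vw4567, vw89) and re-broadcasts a lane at every use, which is the wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3) pattern in the truncated lines above. A simplified sketch of the two forms (not taken verbatim from the files):

    #include <wasm_simd128.h>

    /* loadsplat: one broadcast register per tap, built once outside the loops. */
    static v128_t load_tap(const float* w, int i) {
      return wasm_f32x4_splat(w[i]);                         /* e.g. vk02 = splat(w[3]) */
    }

    /* splat: keep the weights packed and broadcast a lane at the point of use. */
    static v128_t splat_lane3(v128_t vw0123) {
      return wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3); /* broadcast lane 3 */
    }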
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4():
    279  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    294  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk13));
    299  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk23));
    304  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));
    309  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, vk43));
    533  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    548  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk13));
    553  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk23));
    558  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));
    563  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, vk43));
    [all …]
|
D | 5x5p2-minmax-neonfma-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4():
    250  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    265  vo4p0 = vfmaq_lane_f32(vo4p0, vi5x5678, vget_low_f32(vw89AB), 1);
    270  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    275  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    280  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x5678, vwOP, 0);
    505  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    520  vo4p0 = vfmaq_lane_f32(vo4p0, vi5x5678, vget_low_f32(vw89AB), 1);
    525  vo3p0 = vfmaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    530  vo2p0 = vfmaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    535  vo1p0 = vfmaq_lane_f32(vo1p0, vi5x5678, vwOP, 0);
    [all …]
|
D | 5x5p2-minmax-neon-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4():
    250  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    265  vo4p0 = vmlaq_lane_f32(vo4p0, vi5x5678, vget_low_f32(vw89AB), 1);
    270  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    275  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    280  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x5678, vwOP, 0);
    505  const float32x4_t vi5x5678 = vextq_f32(vi5x4567, vi5x89AB, 1);  [local]
    520  vo4p0 = vmlaq_lane_f32(vo4p0, vi5x5678, vget_low_f32(vw89AB), 1);
    525  vo3p0 = vmlaq_lane_f32(vo3p0, vi5x5678, vget_high_f32(vwCDEF), 0);
    530  vo2p0 = vmlaq_lane_f32(vo2p0, vi5x5678, vget_high_f32(vwGHIJ), 1);
    535  vo1p0 = vmlaq_lane_f32(vo1p0, vi5x5678, vwOP, 0);
    [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-5x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_5x4():
    279  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    294  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk13));
    299  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk23));
    304  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));
    309  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, vk43));
    533  const v128_t vi5x5678 = wasm_v32x4_shuffle(vi5x4567, vi5x89AB, 1, 2, 3, 4);  [local]
    548  vo4p0 = wasm_f32x4_add(vo4p0, wasm_f32x4_mul(vi5x5678, vk13));
    553  vo3p0 = wasm_f32x4_add(vo3p0, wasm_f32x4_mul(vi5x5678, vk23));
    558  vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x5678, vk33));
    563  vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x5678, vk43));
    [all …]
|
D | 5x5p2-minmax-sse-2x4.c | in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4():
    202  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    213  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
    220  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    338  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    349  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
    356  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vi5x89AB, _MM_SHUFFLE(1, 0, 2, 1));
    460  const __m128 vi5x5678 = _mm_shuffle_ps(vi5x8567, vi5x8567, _MM_SHUFFLE(0, 3, 2, 1));  [local]
    471  vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x5678, vk43));
    478  const __m128 vi5x6789 = _mm_shuffle_ps(vi5x5678, vzero, _MM_SHUFFLE(1, 0, 2, 1));
|