/external/XNNPACK/src/f32-conv-hwc2chw/
D | 3x3s2p1c3x4-neonfma-2x2.c |
    221  const float32x4_t vk21c1 = vld1q_f32(w + 60); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() local
    223  vo0x0 = vfmaq_laneq_f32(vo0x0, vk21c1, vi2x1, 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
    224  vo1x0 = vfmaq_laneq_f32(vo1x0, vk21c1, vi4x1, 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
    225  vo0x1 = vfmaq_laneq_f32(vo0x1, vk21c1, vi2x2, 3); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
    226  vo1x1 = vfmaq_laneq_f32(vo1x1, vk21c1, vi4x2, 3); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
    493  const float32x4_t vk21c1 = vld1q_f32(w + 60); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() local
    495  vo0x0 = vfmaq_laneq_f32(vo0x0, vk21c1, vi2x1, 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
    496  vo1x0 = vfmaq_laneq_f32(vo1x0, vk21c1, vi4x1, 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
    498  vo0x1 = vfmaq_laneq_f32(vo0x1, vk21c1, vi2x2, 3); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
    499  vo1x1 = vfmaq_laneq_f32(vo1x1, vk21c1, vi4x2, 3); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
D | 3x3s2p1c3x4-neon-2x2.c |
    221  const float32x4_t vk21c1 = vld1q_f32(w + 60); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() local
    223  vo0x0 = vmlaq_lane_f32(vo0x0, vk21c1, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
    224  vo1x0 = vmlaq_lane_f32(vo1x0, vk21c1, vget_low_f32(vi4x1), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
    225  vo0x1 = vmlaq_lane_f32(vo0x1, vk21c1, vget_high_f32(vi2x2), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
    226  vo1x1 = vmlaq_lane_f32(vo1x1, vk21c1, vget_high_f32(vi4x2), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
    491  const float32x4_t vk21c1 = vld1q_f32(w + 60); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() local
    493  vo0x0 = vmlaq_lane_f32(vo0x0, vk21c1, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
    494  vo1x0 = vmlaq_lane_f32(vo1x0, vk21c1, vget_low_f32(vi4x1), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
    496  vo0x1 = vmlaq_lane_f32(vo0x1, vk21c1, vget_high_f32(vi2x2), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
    497  vo1x1 = vmlaq_lane_f32(vo1x1, vk21c1, vget_high_f32(vi4x2), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
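Note: the two NEON listings above apply the same packed weight vector (the naming suggests vk21c1 holds 4 output channels of the 3x3 kernel tap at row 2, column 1, input channel 1) and differ only in how an input lane is broadcast. Below is a minimal sketch of the two idioms, not the XNNPACK source; the helper names are illustrative and the lane is hard-coded to 1.

    #include <arm_neon.h>

    /* FMA idiom (the laneq form is typically AArch64-only): fused multiply-add
     * with lane 1 of a 128-bit input register, as in 3x3s2p1c3x4-neonfma-2x2.c. */
    static inline float32x4_t acc_tap_neonfma(float32x4_t vo, float32x4_t vk, float32x4_t vi) {
      return vfmaq_laneq_f32(vo, vk, vi, 1);                 /* vo += vk * vi[1] */
    }

    /* Plain NEON idiom: vmlaq_lane_f32 indexes a 64-bit half, so the listing
     * first extracts the relevant half with vget_low_f32/vget_high_f32.       */
    static inline float32x4_t acc_tap_neon(float32x4_t vo, float32x4_t vk, float32x4_t vi) {
      return vmlaq_lane_f32(vo, vk, vget_low_f32(vi), 1);    /* vo += vk * vi[1], unfused */
    }

The "2x2" suffix in the file names matches the four accumulators updated per tap in the listing (vo0x0, vo0x1, vo1x0, vo1x1), i.e. a 2x2 output tile.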
D | 3x3s2p1c3x4-wasmsimd-2x2.c |
    222  const v128_t vk21c1 = wasm_v128_load(w + 60); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() local
    224  …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi2x1, vi2x1, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
    225  …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi4x1, vi4x1, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
    226  …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
    227  …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi4x2, vi4x2, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
    493  const v128_t vk21c1 = wasm_v128_load(w + 60); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() local
    495  …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi2x1, vi2x1, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
    496  …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi4x1, vi4x1, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
    498  …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
    499  …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi4x2, vi4x2, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
D | 3x3s2p1c3x4-sse-2x2.c |
    221  const __m128 vk21c1 = _mm_load_ps(w + 60); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() local
    223  …vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
    224  …vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
    225  …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
    226  …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
    493  const __m128 vk21c1 = _mm_load_ps(w + 60); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() local
    495  …vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
    496  …vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x1, vi4x1, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
    498  …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
    499  …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi4x2, vi4x2, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
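Note: the WAsm SIMD and SSE listings perform the same accumulation without a lane multiply-add instruction: the input lane is splatted with a shuffle (wasm_v32x4_shuffle or _mm_shuffle_ps) and then combined with a separate multiply and add. A minimal SSE sketch of that idiom, assuming the same lane-1 broadcast as above (the helper name is illustrative, not XNNPACK code):

    #include <xmmintrin.h>

    /* Splat lane 1 of the input with a shuffle, multiply by the weight vector,
     * and accumulate; base SSE has no fused multiply-add, hence the separate
     * _mm_mul_ps/_mm_add_ps pair seen in the listing.                         */
    static inline __m128 acc_tap_sse(__m128 vo, __m128 vk, __m128 vi) {
      const __m128 vb = _mm_shuffle_ps(vi, vi, _MM_SHUFFLE(1, 1, 1, 1));   /* vi[1] in all lanes */
      return _mm_add_ps(vo, _mm_mul_ps(vk, vb));                           /* vo += vk * vi[1]   */
    }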