/external/XNNPACK/src/f32-conv-hwc2chw/ |
D | 3x3s2p1c3x4-neonfma-2x2.c | 151 const float32x4_t vk10c1 = vld1q_f32(w + 20); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() local 153 vo0x0 = vfmaq_laneq_f32(vo0x0, vk10c1, vi1x0, 2); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 154 vo1x0 = vfmaq_laneq_f32(vo1x0, vk10c1, vi3x0, 2); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 155 vo0x1 = vfmaq_laneq_f32(vo0x1, vk10c1, vi1x2, 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 156 vo1x1 = vfmaq_laneq_f32(vo1x1, vk10c1, vi3x2, 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 413 const float32x4_t vk10c1 = vld1q_f32(w + 20); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() local 415 vo0x0 = vfmaq_laneq_f32(vo0x0, vk10c1, vi1x0, 2); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 416 vo1x0 = vfmaq_laneq_f32(vo1x0, vk10c1, vi3x0, 2); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 417 vo0x1 = vfmaq_laneq_f32(vo0x1, vk10c1, vi1x2, 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 418 vo1x1 = vfmaq_laneq_f32(vo1x1, vk10c1, vi3x2, 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2()
|
D | 3x3s2p1c3x4-neon-2x2.c | 151 const float32x4_t vk10c1 = vld1q_f32(w + 20); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() local 153 vo0x0 = vmlaq_lane_f32(vo0x0, vk10c1, vget_high_f32(vi1x0), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 154 vo1x0 = vmlaq_lane_f32(vo1x0, vk10c1, vget_high_f32(vi3x0), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 155 vo0x1 = vmlaq_lane_f32(vo0x1, vk10c1, vget_low_f32(vi1x2), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 156 vo1x1 = vmlaq_lane_f32(vo1x1, vk10c1, vget_low_f32(vi3x2), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 411 const float32x4_t vk10c1 = vld1q_f32(w + 20); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() local 413 vo0x0 = vmlaq_lane_f32(vo0x0, vk10c1, vget_high_f32(vi1x0), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 414 vo1x0 = vmlaq_lane_f32(vo1x0, vk10c1, vget_high_f32(vi3x0), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 415 vo0x1 = vmlaq_lane_f32(vo0x1, vk10c1, vget_low_f32(vi1x2), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 416 vo1x1 = vmlaq_lane_f32(vo1x1, vk10c1, vget_low_f32(vi3x2), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2()
|
D | 3x3s2p1c3x4-wasmsimd-2x2.c | 152 const v128_t vk10c1 = wasm_v128_load(w + 20); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() local 154 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x0, vi1x0, 2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 155 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi3x0, vi3x0, 2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 156 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 157 …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi3x2, vi3x2, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 413 const v128_t vk10c1 = wasm_v128_load(w + 20); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() local 415 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x0, vi1x0, 2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 416 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi3x0, vi3x0, 2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 417 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 418 …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi3x2, vi3x2, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2()
|
D | 3x3s2p1c3x4-sse-2x2.c | 151 const __m128 vk10c1 = _mm_load_ps(w + 20); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() local 153 …vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 154 …vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 155 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 156 …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 413 const __m128 vk10c1 = _mm_load_ps(w + 20); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() local 415 …vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x0, vi1x0, _MM_SHUFFLE(2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 416 …vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x0, vi3x0, _MM_SHUFFLE(2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 417 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 418 …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi3x2, vi3x2, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2()
|