/external/XNNPACK/src/f32-conv-hwc2chw/ |
D | 3x3s2p1c3x4-neonfma-2x2.c | 106 float32x4_t vo0x1 = vo0x0; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() local 120 vo0x1 = vfmaq_laneq_f32(vo0x1, vk00c0, vi0x1, 3); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 127 vo0x1 = vfmaq_laneq_f32(vo0x1, vk10c0, vi1x1, 3); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 134 vo0x1 = vfmaq_laneq_f32(vo0x1, vk20c0, vi2x1, 3); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 148 vo0x1 = vfmaq_laneq_f32(vo0x1, vk00c1, vi0x2, 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 155 vo0x1 = vfmaq_laneq_f32(vo0x1, vk10c1, vi1x2, 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 162 vo0x1 = vfmaq_laneq_f32(vo0x1, vk20c1, vi2x2, 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 169 vo0x1 = vfmaq_laneq_f32(vo0x1, vk00c2, vi0x2, 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 176 vo0x1 = vfmaq_laneq_f32(vo0x1, vk10c2, vi1x2, 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() 183 vo0x1 = vfmaq_laneq_f32(vo0x1, vk20c2, vi2x2, 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2() [all …]
|
D | 3x3s2p1c3x4-neon-2x2.c | 106 float32x4_t vo0x1 = vo0x0; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() local 120 vo0x1 = vmlaq_lane_f32(vo0x1, vk00c0, vget_high_f32(vi0x1), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 127 vo0x1 = vmlaq_lane_f32(vo0x1, vk10c0, vget_high_f32(vi1x1), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 134 vo0x1 = vmlaq_lane_f32(vo0x1, vk20c0, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 148 vo0x1 = vmlaq_lane_f32(vo0x1, vk00c1, vget_low_f32(vi0x2), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 155 vo0x1 = vmlaq_lane_f32(vo0x1, vk10c1, vget_low_f32(vi1x2), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 162 vo0x1 = vmlaq_lane_f32(vo0x1, vk20c1, vget_low_f32(vi2x2), 0); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 169 vo0x1 = vmlaq_lane_f32(vo0x1, vk00c2, vget_low_f32(vi0x2), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 176 vo0x1 = vmlaq_lane_f32(vo0x1, vk10c2, vget_low_f32(vi1x2), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() 183 vo0x1 = vmlaq_lane_f32(vo0x1, vk20c2, vget_low_f32(vi2x2), 1); in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2() [all …]
|
D | 3x3s2p1c3x4-wasmsimd-2x2.c | 107 v128_t vo0x1 = vo0x0; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() local 121 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi0x1, vi0x1, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 128 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c0, wasm_v32x4_shuffle(vi1x1, vi1x1, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 135 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 149 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c1, wasm_v32x4_shuffle(vi0x2, vi0x2, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 156 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c1, wasm_v32x4_shuffle(vi1x2, vi1x2, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 163 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c1, wasm_v32x4_shuffle(vi2x2, vi2x2, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 170 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk00c2, wasm_v32x4_shuffle(vi0x2, vi0x2, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 177 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk10c2, wasm_v32x4_shuffle(vi1x2, vi1x2, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 184 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c2, wasm_v32x4_shuffle(vi2x2, vi2x2, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() [all …]
|
D | 3x3s2p1c3x4-sse-2x2.c | 106 __m128 vo0x1 = vo0x0; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() local 120 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi0x1, vi0x1, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 127 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c0, _mm_shuffle_ps(vi1x1, vi1x1, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 134 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 148 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c1, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 155 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c1, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 162 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c1, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 169 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk00c2, _mm_shuffle_ps(vi0x2, vi0x2, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 176 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk10c2, _mm_shuffle_ps(vi1x2, vi1x2, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 183 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c2, _mm_shuffle_ps(vi2x2, vi2x2, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() [all …]
|