/external/XNNPACK/src/f32-conv-hwc2chw/ |
D | 3x3s2p1c3x4-wasmsimd-2x2.c | 115 const v128_t vi2x1 = wasm_v128_load(i2); i2 += 4; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() local 122 …vo1x1 = wasm_f32x4_add(vo1x1, wasm_f32x4_mul(vk00c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 135 …vo0x1 = wasm_f32x4_add(vo0x1, wasm_f32x4_mul(vk20c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 190 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 203 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 211 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c1, wasm_v32x4_shuffle(vi2x1, vi2x1, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 224 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c1, wasm_v32x4_shuffle(vi2x1, vi2x1, 1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 239 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk01c2, wasm_v32x4_shuffle(vi2x1, vi2x1, 2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 252 …vo0x0 = wasm_f32x4_add(vo0x0, wasm_f32x4_mul(vk21c2, wasm_v32x4_shuffle(vi2x1, vi2x1, 2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() 260 …vo1x0 = wasm_f32x4_add(vo1x0, wasm_f32x4_mul(vk02c0, wasm_v32x4_shuffle(vi2x1, vi2x1, 3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2() [all …]
|
D | 3x3s2p1c3x4-sse-2x2.c | 114 const __m128 vi2x1 = _mm_loadu_ps(i2); i2 += 4; in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() local 121 …vo1x1 = _mm_add_ps(vo1x1, _mm_mul_ps(vk00c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 134 …vo0x1 = _mm_add_ps(vo0x1, _mm_mul_ps(vk20c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 189 …vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 202 …vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(0, 0, 0, 0))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 210 …vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 223 …vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c1, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(1, 1, 1, 1))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 238 …vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk01c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 251 …vo0x0 = _mm_add_ps(vo0x0, _mm_mul_ps(vk21c2, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(2, 2, 2, 2))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() 259 …vo1x0 = _mm_add_ps(vo1x0, _mm_mul_ps(vk02c0, _mm_shuffle_ps(vi2x1, vi2x1, _MM_SHUFFLE(3, 3, 3, 3))… in xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2() [all …]
|
/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-scalar-3x1.c | 77 float vi2x1 = *i2++; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 101 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 107 vo2p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 109 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 111 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 117 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 159 vo2p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 161 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 163 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1()
|
D | 3x3p1-minmax-scalar-2x1.c | 70 float vi2x1 = *i2++; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() local 89 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 95 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 96 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 101 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 134 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1() 135 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1()
|
D | 3x3p1-minmax-scalar-2x1-acc2.c | 70 float vi2x1 = *i2++; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() local 89 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 95 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 96 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 101 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 136 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2() 137 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2()
|
D | 3x3p1-minmax-scalar-4x1.c | 84 float vi2x1 = *i2++; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 113 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 120 vo2p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 123 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 126 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 133 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 185 vo2p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 188 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 191 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3p1-minmax-scalar-1x1-acc3.c | 63 float vi2x1 = *i2++; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() local 77 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 81 vo0p2 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 85 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3() 109 vo0p2 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3()
|
D | 3x3p1-minmax-scalar-1x1.c | 63 float vi2x1 = *i2++; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() local 77 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 81 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 85 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1() 107 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1()
|
D | 3x3p1-minmax-scalar-1x1-acc2.c | 63 float vi2x1 = *i2++; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() local 77 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 81 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 85 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2() 108 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc2()
|
D | 5x5p2-minmax-scalar-2x1.c | 93 float vi2x1 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 135 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 143 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 144 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 153 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 230 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 238 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 239 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 248 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 310 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() [all …]
|
D | 3x3p1-minmax-scalar-1x1-acc4.c | 63 float vi2x1 = *i2++; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4() local 77 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4() 81 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4() 85 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4() 110 vo0p1 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc4()
|
D | 5x5p2-minmax-scalar-3x1.c | 100 float vi2x1 = 0.0f; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 151 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 159 vo2p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 161 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 163 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 175 vi2x1 = vi2x2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 278 vi2x0 = vi2x1; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 286 vo2p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 288 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 290 vo0p0 += vi2x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() [all …]
|
/external/XNNPACK/src/f32-conv-hwc/gen/ |
D | 3x3s2p1c3x8-neon-2x1.c | 171 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() local 179 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 181 vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c0x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 194 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 196 vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c0x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 203 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 205 vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 218 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 220 vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() 227 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1() [all …]
|
D | 3x3s2p1c3x8-neonfma-2x1.c | 173 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() local 181 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 183 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c0x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 196 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 198 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c0x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 205 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 207 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 220 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 222 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() 229 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1() [all …]
|
D | 3x3s2p0p1c3x8-neon-2x2.c | 106 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() local 116 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 118 vo1x1c4567 = vmlaq_lane_f32(vo1x1c4567, vk00c0x4567, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 141 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 143 vo0x1c4567 = vmlaq_lane_f32(vo0x1c4567, vk20c0x4567, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 155 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 157 vo1x1c4567 = vmlaq_lane_f32(vo1x1c4567, vk00c1x4567, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 180 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 182 vo0x1c4567 = vmlaq_lane_f32(vo0x1c4567, vk20c1x4567, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() 274 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2() [all …]
|
D | 3x3s2p1c3x8-neonfma-2x2.c | 108 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() local 118 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 120 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c0x4567, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 143 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 145 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c0x4567, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 237 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 239 vo1x0c4567 = vfmaq_lane_f32(vo1x0c4567, vk01c0x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 262 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 264 vo0x0c4567 = vfmaq_lane_f32(vo0x0c4567, vk21c0x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() 276 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2() [all …]
|
D | 3x3s2p0p1c3x8-neonfma-2x2.c | 108 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() local 118 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 120 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c0x4567, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 143 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 145 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c0x4567, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 157 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 159 vo1x1c4567 = vfmaq_lane_f32(vo1x1c4567, vk00c1x4567, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 182 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 184 vo0x1c4567 = vfmaq_lane_f32(vo0x1c4567, vk20c1x4567, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() 276 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2() [all …]
|
D | 3x3s2p1c3x8-neon-2x2.c | 106 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() local 116 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 118 vo1x1c4567 = vmlaq_lane_f32(vo1x1c4567, vk00c0x4567, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 141 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 143 vo0x1c4567 = vmlaq_lane_f32(vo0x1c4567, vk20c0x4567, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 235 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 237 vo1x0c4567 = vmlaq_lane_f32(vo1x0c4567, vk01c0x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 260 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 262 vo0x0c4567 = vmlaq_lane_f32(vo0x0c4567, vk21c0x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() 274 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2() [all …]
|
D | 3x3s2p0p1c3x8-neonfma-2x1.c | 200 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() local 205 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 207 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 220 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 222 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 229 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 231 vo1c4567 = vfmaq_lane_f32(vo1c4567, vk01c2x4567, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 244 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 246 vo0c4567 = vfmaq_lane_f32(vo0c4567, vk21c2x4567, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() 253 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1() [all …]
|
D | 3x3s2p0p1c3x8-neon-2x1.c | 198 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() local 203 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 205 vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c1x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 218 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 220 vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c1x4567, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 227 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk01c2x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 229 vo1c4567 = vmlaq_lane_f32(vo1c4567, vk01c2x4567, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 242 vo0c0123 = vmlaq_lane_f32(vo0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 244 vo0c4567 = vmlaq_lane_f32(vo0c4567, vk21c2x4567, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() 251 vo1c0123 = vmlaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x1() [all …]
|
D | 3x3s2p0p1c3x4-neonfma-2x2.c | 103 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() local 111 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 126 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 135 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 150 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 211 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 226 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 235 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 250 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() 266 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2() [all …]
|
D | 3x3s2p1c3x4-neon-2x2.c | 101 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() local 109 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() 124 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() 185 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() 200 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() 209 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() 224 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() 240 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() 255 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() 264 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2() [all …]
|
D | 3x3s2p0p1c3x4-neon-2x2.c | 101 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() local 109 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 124 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 133 vo1x1c0123 = vmlaq_lane_f32(vo1x1c0123, vk00c1x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 148 vo0x1c0123 = vmlaq_lane_f32(vo0x1c0123, vk20c1x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 209 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 224 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 233 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 248 vo0x0c0123 = vmlaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() 264 vo1x0c0123 = vmlaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2() [all …]
|
D | 3x3s2p1c3x4-neonfma-2x2.c | 103 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() local 111 vo1x1c0123 = vfmaq_lane_f32(vo1x1c0123, vk00c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() 126 vo0x1c0123 = vfmaq_lane_f32(vo0x1c0123, vk20c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() 187 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() 202 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() 211 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() 226 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() 242 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk01c2x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() 257 vo0x0c0123 = vfmaq_lane_f32(vo0x0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() 266 vo1x0c0123 = vfmaq_lane_f32(vo1x0c0123, vk02c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2() [all …]
|
D | 3x3s2p1c3x4-neonfma-2x1.c | 144 const float32x4_t vi2x1 = vld1q_f32(i2); i2 += 4; in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() local 151 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() 160 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c0x0123, vget_low_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() 166 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() 175 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c1x0123, vget_low_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() 181 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk01c2x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() 190 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk21c2x0123, vget_high_f32(vi2x1), 0); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() 196 vo1c0123 = vfmaq_lane_f32(vo1c0123, vk02c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() 205 vo0c0123 = vfmaq_lane_f32(vo0c0123, vk22c0x0123, vget_high_f32(vi2x1), 1); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() 247 vi2x0 = vcombine_f32(vget_high_f32(vi2x1), vi2x2); in xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1() [all …]
|