/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-neonfma-2x4.c | 87 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() local 97 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 100 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 103 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 106 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 109 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 119 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 122 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 125 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() 128 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_2x4() [all …]
|
D | 5x5p2-minmax-neon-2x4.c | 87 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() local 97 vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 100 vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 103 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 106 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 109 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 119 vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 122 vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 125 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() 128 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_2x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-2x4.c | 116 v128_t vo1p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() local 126 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 129 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 132 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 135 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 138 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 148 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 151 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 154 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() 157 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_2x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-2x4.c | 116 v128_t vo1p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() local 126 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 129 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 132 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 135 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 138 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 148 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 151 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 154 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() 157 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_2x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-2x4.c | 90 v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() local 100 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 106 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 109 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 112 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 122 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 125 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 128 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() 131 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_2x4() [all …]
|
D | 5x5p2-minmax-scalar-2x1.c | 123 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() local 125 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 127 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 129 vo1p0 += vi4x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 131 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 141 vo1p0 += vi1x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 143 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 145 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 147 vo1p0 += vi4x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() 149 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-2x4.c | 90 v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() local 100 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 103 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 106 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 109 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 112 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 122 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 125 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 128 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() 131 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_2x4() [all …]
|
D | 5x5p2-minmax-sse-2x4.c | 112 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() local 114 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 116 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 118 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 120 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 150 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 152 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 154 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 156 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() 158 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_2x4() [all …]
|
D | 5x5s2p2-minmax-scalar-2x1.c | 147 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() local 149 vo1p0 += vi3x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 151 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 153 vo1p0 += vi5x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 155 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 166 vo1p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 168 vo1p0 += vi3x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 170 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 172 vo1p0 += vi5x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() 174 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1() [all …]
|
D | 5x5s2p2-minmax-neon-2x4.c | 110 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() local 113 vo1p0 = vmlaq_lane_f32(vo1p0, vi2x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 116 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 119 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 122 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 125 vo1p0 = vmlaq_lane_f32(vo1p0, vi6x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 128 vo1p0 = vmlaq_lane_f32(vo1p0, vi2x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 131 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 134 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() 137 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_2x4() [all …]
|
D | 5x5s2p2-minmax-neonfma-2x4.c | 110 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() local 113 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 116 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 119 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 122 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 125 vo1p0 = vfmaq_lane_f32(vo1p0, vi6x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 128 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 131 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 134 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() 137 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_2x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 124 v128_t vo1p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 136 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 140 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 144 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 148 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 152 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 164 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 168 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 172 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 176 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() [all …]
|
D | 5x5p2-minmax-neonfma-3x4.c | 95 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local 107 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 111 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 115 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 119 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 123 vo1p0 = vfmaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 135 vo1p0 = vfmaq_lane_f32(vo1p0, vi1x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 139 vo1p0 = vfmaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 143 vo1p0 = vfmaq_lane_f32(vo1p0, vi3x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 147 vo1p0 = vfmaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 124 v128_t vo1p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 136 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 140 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 144 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 148 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 152 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 164 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 168 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 172 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 176 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() [all …]
|
D | 5x5p2-minmax-neon-3x4.c | 95 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local 107 vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 111 vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 115 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 119 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 123 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 135 vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 139 vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 143 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 147 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4.c | 162 v128_t vo1p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() local 165 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 168 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 171 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 174 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 177 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 180 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 183 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 186 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 189 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4.c | 162 v128_t vo1p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() local 165 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 168 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 171 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 174 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 177 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 180 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 183 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 186 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 189 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-2x4.c | 142 v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() local 145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 148 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 151 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 157 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 160 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 163 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 166 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 169 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() [all …]
|
D | 5x5p2-minmax-scalar-3x1.c | 134 float vo1p0 = vbias + vi1x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 137 vo1p0 += vi2x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 140 vo1p0 += vi3x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 143 vo1p0 += vi4x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 146 vo1p0 += vi5x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 158 vo1p0 += vi1x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 161 vo1p0 += vi2x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 164 vo1p0 += vi3x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 167 vo1p0 += vi4x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 170 vo1p0 += vi5x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-2x4.c | 142 v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() local 145 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 148 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 151 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 157 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 160 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 163 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 166 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 169 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() [all …]
|
D | 5x5p2-minmax-sse-3x4.c | 121 __m128 vo1p0 = _mm_add_ps(vbias, _mm_mul_ps(vi1x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 124 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 127 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 130 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 133 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 168 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi1x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 171 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi2x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 174 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi3x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 177 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi4x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 180 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() [all …]
|
D | 5x5s2p2-minmax-scalar-3x1.c | 169 float vo1p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 172 vo1p0 += vi3x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 175 vo1p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 178 vo1p0 += vi5x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 181 vo1p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 195 vo1p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 198 vo1p0 += vi3x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 201 vo1p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 204 vo1p0 += vi5x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 207 vo1p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-3x4.c | 98 v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() local 110 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 114 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 118 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 122 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 126 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 138 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 142 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 146 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 150 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() [all …]
|
D | 5x5p2-minmax-neon-4x4.c | 103 float32x4_t vo1p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 117 vo1p0 = vmlaq_lane_f32(vo1p0, vi1x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 122 vo1p0 = vmlaq_lane_f32(vo1p0, vi2x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 127 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 132 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 137 vo1p0 = vmlaq_lane_f32(vo1p0, vi5x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 151 vo1p0 = vmlaq_lane_f32(vo1p0, vi1x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 156 vo1p0 = vmlaq_lane_f32(vo1p0, vi2x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 161 vo1p0 = vmlaq_lane_f32(vo1p0, vi3x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 166 vo1p0 = vmlaq_lane_f32(vo1p0, vi4x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4.c | 98 v128_t vo1p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() local 110 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 114 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 118 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 122 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 126 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 138 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi1x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 142 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 146 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 150 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() [all …]
|