/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 125 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local 137 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 141 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 145 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 149 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 153 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 165 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 173 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() 177 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() [all …]
|
D | 5x5p2-minmax-neonfma-3x4.c | 96 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local 108 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 112 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 116 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 120 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 124 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 136 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 140 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 144 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() 148 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 125 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local 137 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 141 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 145 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 149 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 153 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 165 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 173 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() 177 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() [all …]
|
D | 5x5p2-minmax-neon-3x4.c | 96 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local 108 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 112 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 116 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 120 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 124 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 136 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 140 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 144 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() 148 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() [all …]
|
D | 5x5p2-minmax-scalar-3x1.c | 135 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 138 vo2p0 += vi3x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 141 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 144 vo2p0 += vi5x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 147 vo2p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 159 vo2p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 162 vo2p0 += vi3x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 165 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 168 vo2p0 += vi5x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 171 vo2p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() [all …]
|
D | 5x5p2-minmax-sse-3x4.c | 122 __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local 125 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 128 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 131 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 134 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 169 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 172 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 175 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 178 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() 181 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() [all …]
|
D | 5x5s2p2-minmax-scalar-3x1.c | 170 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 173 vo2p0 += vi5x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 176 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 179 vo2p0 += vi7x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 182 vo2p0 += vi8x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 196 vo2p0 += vi4x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 199 vo2p0 += vi5x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 202 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 205 vo2p0 += vi7x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 208 vo2p0 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-3x4.c | 99 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() local 111 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 115 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 119 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 123 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 127 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 139 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 143 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 147 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() 151 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() [all …]
|
D | 5x5p2-minmax-neon-4x4.c | 104 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local 118 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 123 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 128 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 133 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 138 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 152 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 157 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 162 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() 167 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-3x4.c | 99 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() local 111 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 115 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 119 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 123 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 127 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 139 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 143 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 147 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() 151 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() [all …]
|
D | 5x5p2-minmax-neonfma-4x4.c | 104 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local 118 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 123 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 128 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 133 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 138 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 152 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 157 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 162 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() 167 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() [all …]
|
D | 5x5s2p2-minmax-neon-3x4.c | 127 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() local 131 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 135 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 139 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 143 vo2p0 = vmlaq_lane_f32(vo2p0, vi7x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 147 vo2p0 = vmlaq_lane_f32(vo2p0, vi8x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 151 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 155 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 159 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() 163 vo2p0 = vmlaq_lane_f32(vo2p0, vi7x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() [all …]
|
D | 5x5s2p2-minmax-neonfma-3x4.c | 127 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() local 131 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 135 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 139 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 143 vo2p0 = vfmaq_lane_f32(vo2p0, vi7x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 147 vo2p0 = vfmaq_lane_f32(vo2p0, vi8x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 151 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 155 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 159 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() 163 vo2p0 = vfmaq_lane_f32(vo2p0, vi7x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c | 133 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local 147 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 152 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 157 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 162 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 167 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 181 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() 196 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c | 133 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local 147 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 152 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 157 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 162 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 167 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 181 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() 196 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 187 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() local 191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 199 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 203 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 207 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 211 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 215 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 219 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 187 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() local 191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 199 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 203 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 207 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 211 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 215 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 219 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() [all …]
|
D | 5x5p2-minmax-sse-4x4.c | 131 __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local 135 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 139 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 143 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 147 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 187 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 191 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 195 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 199 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() 203 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4.c | 167 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() local 171 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 179 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 183 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 187 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 191 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 199 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 203 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c | 141 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local 157 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 163 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 175 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 181 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 197 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 203 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 209 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() 215 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-x86-splat-4x4.c | 107 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local 121 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 126 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 131 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 136 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 141 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 155 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 160 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 165 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() 170 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4.c | 167 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() local 171 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 179 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 183 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 187 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 191 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 195 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 199 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 203 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() [all …]
|
D | 5x5p2-minmax-neonfma-5x4.c | 112 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local 128 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 134 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 140 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 146 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 152 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 168 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 174 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 180 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() 186 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() [all …]
|
D | 5x5p2-minmax-neon-5x4.c | 112 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local 128 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 134 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 140 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 146 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 152 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 168 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 174 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 180 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() 186 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() [all …]
|
D | 5x5p2-minmax-wasmsimd-arm-splat-4x4.c | 107 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local 121 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 126 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 131 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 136 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 141 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 155 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 160 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 165 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() 170 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() [all …]
|