Home
last modified time | relevance | path

Searched refs:vo2p0 (Results 1 – 25 of 107) sorted by relevance

12345

/external/XNNPACK/src/f32-dwconv2d-chw/gen/
D5x5p2-minmax-wasmsimd-x86-loadsplat-3x4.c125 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4() local
137 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
141 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
145 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
149 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
153 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
165 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
173 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
177 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_3x4()
[all …]
D5x5p2-minmax-neonfma-3x4.c96 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4() local
108 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
112 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
116 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
120 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
124 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
136 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
140 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
144 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
148 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-3x4.c125 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4() local
137 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
141 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
145 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
149 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
153 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
165 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
173 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
177 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_3x4()
[all …]
D5x5p2-minmax-neon-3x4.c96 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4() local
108 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
112 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
116 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
120 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
124 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
136 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
140 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
144 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
148 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_3x4()
[all …]
D5x5p2-minmax-scalar-3x1.c135 float vo2p0 = vbias + vi2x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local
138 vo2p0 += vi3x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
141 vo2p0 += vi4x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
144 vo2p0 += vi5x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
147 vo2p0 += vi6x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
159 vo2p0 += vi2x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
162 vo2p0 += vi3x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
165 vo2p0 += vi4x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
168 vo2p0 += vi5x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
171 vo2p0 += vi6x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
[all …]
D5x5p2-minmax-sse-3x4.c122 __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4() local
125 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
128 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
131 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
134 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
169 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
172 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
175 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
178 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
181 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_3x4()
[all …]
D5x5s2p2-minmax-scalar-3x1.c170 float vo2p0 = vbias + vi4x0 * vk00; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local
173 vo2p0 += vi5x0 * vk10; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
176 vo2p0 += vi6x0 * vk20; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
179 vo2p0 += vi7x0 * vk30; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
182 vo2p0 += vi8x0 * vk40; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
196 vo2p0 += vi4x1 * vk01; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
199 vo2p0 += vi5x1 * vk11; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
202 vo2p0 += vi6x1 * vk21; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
205 vo2p0 += vi7x1 * vk31; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
208 vo2p0 += vi8x1 * vk41; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-3x4.c99 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4() local
111vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
115vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
119vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
123vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
127vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
139vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
143vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
147vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
151vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4()
[all …]
D5x5p2-minmax-neon-4x4.c104 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4() local
118 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
123 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
128 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
133 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
138 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
152 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
157 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
162 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
167 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_4x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-3x4.c99 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4() local
111vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
115vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
119vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
123vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
127vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
139vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
143vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
147vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
151vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4()
[all …]
D5x5p2-minmax-neonfma-4x4.c104 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4() local
118 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
123 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
128 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
133 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
138 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
152 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
157 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
162 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
167 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4()
[all …]
D5x5s2p2-minmax-neon-3x4.c127 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4() local
131 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
135 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
139 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
143 vo2p0 = vmlaq_lane_f32(vo2p0, vi7x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
147 vo2p0 = vmlaq_lane_f32(vo2p0, vi8x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
151 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
155 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
159 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
163 vo2p0 = vmlaq_lane_f32(vo2p0, vi7x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_3x4()
[all …]
D5x5s2p2-minmax-neonfma-3x4.c127 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4() local
131 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x8ACE9BDF.val[0], vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
135 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x8ACE9BDF.val[0], vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
139 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x8ACE9BDF.val[0], vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
143 vo2p0 = vfmaq_lane_f32(vo2p0, vi7x8ACE9BDF.val[0], vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
147 vo2p0 = vfmaq_lane_f32(vo2p0, vi8x8ACE9BDF.val[0], vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
151 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x8ACE9BDF.val[1], vget_low_f32(vw4567), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
155 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x8ACE9BDF.val[1], vget_low_f32(vw89AB), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
159 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x8ACE9BDF.val[1], vget_high_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
163 vo2p0 = vfmaq_lane_f32(vo2p0, vi7x8ACE9BDF.val[1], vget_high_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_3x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-4x4.c133 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4() local
147 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
152 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
157 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
162 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
167 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
181 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
196 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_4x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-loadsplat-4x4.c133 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4() local
147 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
152 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
157 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
162 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
167 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
181 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
186 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
196 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_loadsplat_4x4()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4.c187 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() local
191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
199 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
203 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
207 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
211 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
215 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
219 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4.c187 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() local
191 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x8ACE, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
199 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
203 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
207 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
211 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x9BDF, vk03)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
215 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, vk13)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
219 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, vk23)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
223 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, vk33)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4()
[all …]
D5x5p2-minmax-sse-4x4.c131 __m128 vo2p0 = _mm_add_ps(vbias, _mm_mul_ps(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4() local
135 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
139 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
143 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
147 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
187 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
191 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
195 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
199 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
203 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi6x3456, vk41)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4()
[all …]
D5x5s2p2-minmax-wasmsimd-x86-splat-3x4.c167 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() local
171vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
175vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
179vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
183vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
187vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
191vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
195vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
199vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
203vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-loadsplat-5x4.c141 v128_t vo2p0 = vbias; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4() local
157 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, vk02)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
163 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
169 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, vk22)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
175 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
181 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, vk42)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
197 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, vk01)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
203 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, vk11)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
209 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, vk21)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
215 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, vk31)); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_loadsplat_5x4()
[all …]
D5x5p2-minmax-wasmsimd-x86-splat-4x4.c107 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4() local
121vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
126vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
131vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
136vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
141vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
155vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
160vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
165vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
170vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_4x4()
[all …]
D5x5s2p2-minmax-wasmsimd-arm-splat-3x4.c167 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() local
171vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x8ACE, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
175vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
179vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x8ACE, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
183vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
187vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi8x8ACE, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
191vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x9BDF, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
195vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x9BDF, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
199vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x9BDF, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
203vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi7x9BDF, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4()
[all …]
D5x5p2-minmax-neonfma-5x4.c112 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4() local
128 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
134 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
140 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
146 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
152 vo2p0 = vfmaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
168 vo2p0 = vfmaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
174 vo2p0 = vfmaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
180 vo2p0 = vfmaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
186 vo2p0 = vfmaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_5x4()
[all …]
D5x5p2-minmax-neon-5x4.c112 float32x4_t vo2p0 = vdupq_lane_f32(vget_low_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4() local
128 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x4567, vget_high_f32(vw0123), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
134 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x4567, vget_low_f32(vw89AB), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
140 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x4567, vget_low_f32(vwCDEF), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
146 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x4567, vget_high_f32(vwGHIJ), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
152 vo2p0 = vmlaq_lane_f32(vo2p0, vi6x4567, vget_high_f32(vwKLMN), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
168 vo2p0 = vmlaq_lane_f32(vo2p0, vi2x3456, vget_high_f32(vw0123), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
174 vo2p0 = vmlaq_lane_f32(vo2p0, vi3x3456, vget_high_f32(vw4567), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
180 vo2p0 = vmlaq_lane_f32(vo2p0, vi4x3456, vget_low_f32(vwCDEF), 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
186 vo2p0 = vmlaq_lane_f32(vo2p0, vi5x3456, vget_low_f32(vwGHIJ), 1); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_5x4()
[all …]
D5x5p2-minmax-wasmsimd-arm-splat-4x4.c107 v128_t vo2p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4() local
121vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
126vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
131vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
136vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
141vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi6x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
155vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi2x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
160vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi3x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
165vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi4x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
170vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1,… in xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_4x4()
[all …]

12345