/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-3x4.c | 168 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() local 171 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 173 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 235 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() local 238 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 241 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 249 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 258 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4() 261 *o2 = wasm_f32x4_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_3x4()
|
D | 3x3p1-minmax-scalar-3x1.c | 134 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 138 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 140 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 170 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() local 174 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1() 176 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_3x1()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-3x4.c | 157 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() local 160 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 162 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 231 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() local 234 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 237 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 242 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 248 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4() 251 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_3x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-3x4.c | 168 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() local 171 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 173 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 235 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() local 238 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 241 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 249 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 258 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4() 261 *o2 = wasm_f32x4_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_3x4()
|
D | 3x3s2p1-minmax-scalar-3x1.c | 158 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() local 162 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 164 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 202 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() local 206 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1() 208 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_3x1()
|
D | 3x3p1-minmax-ssse3-3x4.c | 161 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() local 165 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 167 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 230 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() local 234 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 237 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 245 _mm_storel_pi((__m64*) o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 254 vo2 = _mm_movehl_ps(vo2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4() 257 _mm_store_ss(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_3x4()
|
D | 3x3s2p1-minmax-wasmsimd-x86-loadsplat-3x4.c | 200 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() local 203 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 205 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 293 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() local 296 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 300 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 305 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 311 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4() 314 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_3x4()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c | 192 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 196 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 201 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 275 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 279 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 285 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 295 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 304 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 310 *o2 = wasm_f32x4_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
|
D | 3x3p1-minmax-scalar-4x1.c | 154 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 159 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 163 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 199 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 204 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 208 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3p1-minmax-sse-3x4.c | 203 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() local 207 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 209 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 304 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() local 308 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 311 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 319 _mm_storel_pi((__m64*) o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 328 vo2 = _mm_movehl_ps(vo2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4() 331 _mm_store_ss(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_3x4()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-3x4.c | 157 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() local 160 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 162 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 231 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() local 234 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 237 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 242 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 248 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4() 251 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_3x4()
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-3x4.c | 190 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() local 193 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 195 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 283 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() local 286 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 290 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 295 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 301 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4() 304 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_3x4()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-4x4.c | 180 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() local 184 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 188 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 270 v128_t vo2 = wasm_v128_bitselect(vmin, vo2p0, wasm_f32x4_lt(vo2p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() local 274 vo2 = wasm_v128_bitselect(vo2, vmax, wasm_f32x4_le(vo2, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 279 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 285 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 291 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 296 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
|
D | 3x3p1-minmax-scalar-5x1.c | 174 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 180 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 186 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 228 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 234 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 240 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
|
D | 3x3s2p1-minmax-scalar-4x1.c | 187 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 192 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 196 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 242 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 247 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 251 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1()
|
D | 5x5p2-minmax-scalar-3x1.c | 248 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 252 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 254 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 351 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 355 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 357 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 416 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() local 420 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1() 422 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1()
|
D | 3x3p1-minmax-ssse3-4x4.c | 185 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 190 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 195 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 270 __m128 vo2 = _mm_max_ps(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 275 vo2 = _mm_min_ps(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 281 _mm_storeu_ps(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 291 _mm_storel_pi((__m64*) o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 300 vo2 = _mm_movehl_ps(vo2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 306 _mm_store_ss(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c | 192 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 196 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 201 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 275 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 279 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 285 wasm_v128_store(o2, vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 295 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 304 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 310 *o2 = wasm_f32x4_extract_lane(vo2, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
|
D | 3x3s2p1-minmax-wasmsimd-arm-loadsplat-3x4.c | 200 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() local 203 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 205 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 293 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() local 296 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 300 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 305 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 311 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4() 314 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_3x4()
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-3x4.c | 190 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() local 193 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 195 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 283 v128_t vo2 = wasm_f32x4_max(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() local 286 vo2 = wasm_f32x4_min(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 290 wasm_v128_store(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 295 *((double*) o2) = wasm_f64x2_extract_lane(vo2, 0); o2 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 301 vo2 = wasm_v32x4_shuffle(vo2, vo2, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4() 304 *o2 = wasm_f32x4_extract_lane(vo2, 0); o2 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_3x4()
|
D | 5x5p2-minmax-scalar-3x1-acc2.c | 251 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 255 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 257 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 357 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 361 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 363 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 425 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() local 429 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2() 431 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_3x1_acc2()
|
D | 5x5s2p2-minmax-scalar-3x1.c | 281 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 285 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 287 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 369 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 373 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 375 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 430 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() local 434 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1() 436 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1()
|
D | 5x5s2p2-minmax-scalar-3x1-acc2.c | 284 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 288 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 290 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 375 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 379 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 381 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 439 float vo2 = math_max_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() local 443 vo2 = math_min_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2() 445 *o2++ = vo2; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_3x1_acc2()
|
D | 3x3p1-minmax-neonfma-3x4.c | 154 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() local 158 vo2 = vminq_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 160 vst1q_f32(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 230 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() local 234 vo2 = vminq_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 237 vst1q_f32(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 243 float32x2_t vo2_lo = vget_low_f32(vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4() 251 vo2_lo = vget_high_f32(vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4()
|
D | 3x3p1-minmax-neon-3x4.c | 154 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() local 158 vo2 = vminq_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 160 vst1q_f32(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 230 float32x4_t vo2 = vmaxq_f32(vo2p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() local 234 vo2 = vminq_f32(vo2, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 237 vst1q_f32(o2, vo2); o2 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 243 float32x2_t vo2_lo = vget_low_f32(vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4() 251 vo2_lo = vget_high_f32(vo2); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_3x4()
|