/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-4x4.c | 193 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 197 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 199 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 276 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() local 280 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 283 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 293 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 305 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4() 308 *o3 = wasm_f32x4_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_4x4()
|
D | 3x3p1-minmax-scalar-4x1.c | 155 float vo3 = math_max_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 160 vo3 = math_min_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 162 *o3++ = vo3; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 200 float vo3 = math_max_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() local 205 vo3 = math_min_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1() 207 *o3++ = vo3; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-4x4.c | 181 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() local 185 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 187 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 271 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() local 275 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 278 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 284 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 292 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4() 295 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_4x4()
|
D | 3x3p1-minmax-scalar-5x1.c | 175 float vo3 = math_max_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 181 vo3 = math_min_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 185 *o3++ = vo3; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 229 float vo3 = math_max_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() local 235 vo3 = math_min_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1() 239 *o3++ = vo3; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_5x1()
|
D | 3x3s2p1-minmax-scalar-4x1.c | 188 float vo3 = math_max_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 193 vo3 = math_min_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 195 *o3++ = vo3; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 243 float vo3 = math_max_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() local 248 vo3 = math_min_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1() 250 *o3++ = vo3; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_4x1()
|
D | 3x3p1-minmax-ssse3-4x4.c | 186 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 191 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 193 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 271 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() local 276 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 279 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 289 _mm_storel_pi((__m64*) o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 301 vo3 = _mm_movehl_ps(vo3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4() 304 _mm_store_ss(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-4x4.c | 193 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 197 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 199 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 276 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() local 280 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 283 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 293 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 305 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4() 308 *o3 = wasm_f32x4_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_4x4()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-5x4.c | 217 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 222 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 227 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 316 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() local 321 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 327 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 339 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 351 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4() 357 *o3 = wasm_f32x4_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_5x4()
|
D | 3x3p1-minmax-sse-4x4.c | 236 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4() local 241 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4() 243 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4() 359 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4() local 364 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4() 367 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4() 377 _mm_storel_pi((__m64*) o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4() 389 vo3 = _mm_movehl_ps(vo3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4() 392 _mm_store_ss(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-5x4.c | 217 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 222 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 227 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 316 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() local 321 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 327 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 339 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 351 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4() 357 *o3 = wasm_f32x4_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_5x4()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-4x4.c | 181 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() local 185 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 187 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 271 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() local 275 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 278 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 284 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 292 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4() 295 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_4x4()
|
D | 3x3p1-minmax-wasmsimd-x86-loadsplat-6x4.c | 241 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 247 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 255 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 356 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() local 362 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 371 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 385 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 397 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4() 406 *o3 = wasm_f32x4_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_6x4()
|
D | 3x3s2p1-minmax-wasmsimd-x86-loadsplat-4x4.c | 237 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4() local 241 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4() 243 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4() 353 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4() local 357 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4() 361 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4() 367 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4() 375 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4() 378 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_loadsplat_4x4()
|
D | 3x3s2p1-minmax-wasmsimd-x86-splat-4x4.c | 227 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() local 231 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 233 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 343 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() local 347 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 351 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 357 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 365 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4() 368 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_4x4()
|
D | 3x3p1-minmax-wasmsimd-x86-splat-5x4.c | 204 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() local 209 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 213 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 310 v128_t vo3 = wasm_v128_bitselect(vmin, vo3p0, wasm_f32x4_lt(vo3p0, vmin)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() local 315 vo3 = wasm_v128_bitselect(vo3, vmax, wasm_f32x4_le(vo3, vmax)); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 320 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 327 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 335 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4() 340 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_splat_5x4()
|
D | 3x3p1-minmax-scalar-6x1.c | 195 float vo3 = math_max_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local 202 vo3 = math_min_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 208 *o3++ = vo3; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 258 float vo3 = math_max_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() local 265 vo3 = math_min_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1() 271 *o3++ = vo3; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_6x1()
|
D | 3x3p1-minmax-ssse3-5x4.c | 210 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local 216 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 221 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 311 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() local 317 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 323 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 335 _mm_storel_pi((__m64*) o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 347 vo3 = _mm_movehl_ps(vo3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4() 353 _mm_store_ss(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_5x4()
|
D | 3x3s2p1-minmax-wasmsimd-arm-loadsplat-4x4.c | 237 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4() local 241 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4() 243 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4() 353 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4() local 357 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4() 361 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4() 367 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4() 375 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4() 378 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_loadsplat_4x4()
|
D | 3x3p1-minmax-sse-5x4.c | 268 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local 274 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 279 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 413 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() local 419 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 425 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 437 _mm_storel_pi((__m64*) o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 449 vo3 = _mm_movehl_ps(vo3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4() 455 _mm_store_ss(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_5x4()
|
D | 3x3p1-minmax-neonfma-4x4.c | 178 float32x4_t vo3 = vmaxq_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() local 183 vo3 = vminq_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 185 vst1q_f32(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 270 float32x4_t vo3 = vmaxq_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() local 275 vo3 = vminq_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 278 vst1q_f32(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 286 float32x2_t vo3_lo = vget_low_f32(vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4() 296 vo3_lo = vget_high_f32(vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_4x4()
|
D | 3x3p1-minmax-neon-4x4.c | 178 float32x4_t vo3 = vmaxq_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() local 183 vo3 = vminq_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 185 vst1q_f32(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 270 float32x4_t vo3 = vmaxq_f32(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() local 275 vo3 = vminq_f32(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 278 vst1q_f32(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 286 float32x2_t vo3_lo = vget_low_f32(vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4() 296 vo3_lo = vget_high_f32(vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_4x4()
|
D | 3x3s2p1-minmax-wasmsimd-arm-splat-4x4.c | 227 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() local 231 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 233 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 343 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() local 347 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 351 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 357 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 365 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4() 368 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_4x4()
|
D | 3x3p1-minmax-wasmsimd-arm-loadsplat-6x4.c | 241 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 247 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 255 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 356 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() local 362 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 371 wasm_v128_store(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 385 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 397 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4() 406 *o3 = wasm_f32x4_extract_lane(vo3, 0); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_6x4()
|
D | 3x3p1-minmax-wasmsimd-arm-splat-5x4.c | 204 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() local 209 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 213 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 310 v128_t vo3 = wasm_f32x4_max(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() local 315 vo3 = wasm_f32x4_min(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 320 wasm_v128_store(o3, vo3); o3 += 4; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 327 *((double*) o3) = wasm_f64x2_extract_lane(vo3, 0); o3 += 2; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 335 vo3 = wasm_v32x4_shuffle(vo3, vo3, 2, 3, 0, 1); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4() 340 *o3 = wasm_f32x4_extract_lane(vo3, 0); o3 += 1; in xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_splat_5x4()
|
D | 3x3p1-minmax-ssse3-6x4.c | 234 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 241 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 249 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 351 __m128 vo3 = _mm_max_ps(vo3p0, vmin); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() local 358 vo3 = _mm_min_ps(vo3, vmax); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 367 _mm_storeu_ps(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 381 _mm_storel_pi((__m64*) o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 393 vo3 = _mm_movehl_ps(vo3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4() 402 _mm_store_ss(o3, vo3); in xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_6x4()
|