/external/XNNPACK/src/f32-dwconv2d-chw/gen/ |
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() local 174 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 349 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 370 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 395 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4() 441 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4-acc2.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() local 174 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 351 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 372 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 397 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2() 443 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4-acc2.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() local 174 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 351 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 372 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 397 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2() 443 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() local 174 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 349 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 370 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 395 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4() 441 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4()
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-2x4-acc3.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() local 174 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 353 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 374 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 399 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3() 445 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_2x4_acc3()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-2x4-acc3.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() local 174 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 204 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 205 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 306 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 307 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 353 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 374 vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 399 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3() 445 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_2x4_acc3()
|
D | 5x5s2p2-minmax-sse-2x4-acc2.c | 150 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() local 164 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() 173 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() 301 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() 302 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() 353 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() 365 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() 374 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2() 440 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc2()
|
D | 5x5s2p2-minmax-sse-2x4-acc3.c | 150 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() local 164 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() 173 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() 301 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() 302 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() 355 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() 367 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() 376 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3() 442 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4_acc3()
|
D | 5x5s2p2-minmax-sse-2x4.c | 150 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() local 164 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() 173 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() 301 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() 302 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() 351 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() 363 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() 372 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4() 438 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vzero); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4()
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-2x4-acc2.c | 134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() local 154 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() 184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() 185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() 286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() 287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() 331 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() 352 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() 377 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2() 423 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-2x4.c | 134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() local 154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 329 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 350 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 375 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4() 421 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4()
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-2x4.c | 134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() local 154 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 329 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 350 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 375 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4() 421 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4()
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-2x4-acc2.c | 134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() local 154 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() 184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() 185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() 286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() 287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() 331 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() 352 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() 377 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2() 423 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc2()
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-2x4-acc3.c | 134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() local 154 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() 184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() 185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() 286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() 287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() 333 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() 354 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() 379 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3() 425 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_2x4_acc3()
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-2x4-acc3.c | 134 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() local 154 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() 184 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() 185 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() 286 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() 287 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() 333 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() 354 …vo1p2 = wasm_f32x4_add(vo1p2, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() 379 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3() 425 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vzero, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_2x4_acc3()
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4-acc2.c | 174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() local 195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 202 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 240 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 430 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 451 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() 458 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-loadsplat-3x4.c | 174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() local 195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 202 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 240 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 427 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 448 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() 455 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_loadsplat_3x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4-acc2.c | 174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() local 195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 202 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 240 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 430 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 451 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() 458 vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-loadsplat-3x4.c | 174 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() local 195 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 202 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 239 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 240 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 369 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 427 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 448 vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() 455 vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_loadsplat_3x4() [all …]
|
D | 5x5s2p2-minmax-sse-3x4.c | 170 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() local 186 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 191 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 202 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 369 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 434 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 448 vo2p0 = _mm_add_ps(vo2p0, _mm_mul_ps(vi5x8ACE, vk12)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 453 vo1p0 = _mm_add_ps(vo1p0, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() 464 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4() [all …]
|
D | 5x5s2p2-minmax-sse-3x4-acc2.c | 170 __m128 vi5x8ACE = _mm_shuffle_ps(vi5x89AB, vi5xCDEF, _MM_SHUFFLE(2, 0, 2, 0)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() local 186 __m128 vo2p1 = _mm_mul_ps(vi5x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 191 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 202 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 369 const __m128 vi5xGACE = _mm_move_ss(vi5x8ACE, vi5xGIKM); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 370 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 437 vi5x8ACE = _mm_and_ps(vi5x8ACE, vmask_even); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 451 __m128 vo2p1 = _mm_mul_ps(vi5x8ACE, vk12); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 456 vo1p1 = _mm_add_ps(vo1p1, _mm_mul_ps(vi5x8ACE, vk32)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() 467 const __m128 vi5xE8AC = _mm_shuffle_ps(vi5x8ACE, vi5x8ACE, _MM_SHUFFLE(2, 1, 0, 3)); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4-acc2.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() local 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 182 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 219 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 220 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 349 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 350 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 410 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 431 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() 438 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4-acc2.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() local 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 182 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 219 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 220 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 349 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 350 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 410 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 431 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() 438 …vo1p1 = wasm_f32x4_add(vo1p1, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4_acc2() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-x86-splat-3x4.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() local 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 182 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 219 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 220 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 349 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 350 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 407 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 428 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() 435 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_3x4() [all …]
|
D | 5x5s2p2-minmax-wasmsimd-arm-splat-3x4.c | 154 v128_t vi5x8ACE = wasm_v32x4_shuffle(vi5x89AB, vi5xCDEF, 0, 2, 4, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() local 175 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 182 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 219 const v128_t vi5x68AC = wasm_v32x4_shuffle(vi5x0246, vi5x8ACE, 3, 4, 5, 6); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 220 vi5x0246 = vi5x8ACE; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 349 const v128_t vi5xACEG = wasm_v32x4_shuffle(vi5x8ACE, vi5xGIKM, 1, 2, 3, 4); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 350 vi5x8ACE = vi5xGIKM; in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 407 vi5x8ACE = wasm_v128_and(vmask_even, vi5x8ACE); in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 428 …vo2p0 = wasm_f32x4_add(vo2p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() 435 …vo1p0 = wasm_f32x4_add(vo1p0, wasm_f32x4_mul(vi5x8ACE, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2,… in xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_3x4() [all …]
|