/external/XNNPACK/src/f32-dwconv/gen/ |
D | up8x25-minmax-fma3.c | 167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() local 174 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 180 vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 186 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 192 vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 198 vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 204 vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 210 vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 216 vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() 222 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3() [all …]
|
D | up8x25-minmax-avx.c | 167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() local 174 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 180 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 186 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 192 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 198 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 204 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 210 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 216 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() 222 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx() [all …]
|
D | up16x25-minmax-fma3.c | 167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() local 177 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 186 vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 195 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 204 vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 213 vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 222 vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 231 vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 240 vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() 249 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3() [all …]
|
D | up16x25-minmax-avx.c | 167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() local 177 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 186 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 195 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 204 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 213 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 222 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 231 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 240 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() 249 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx() [all …]
|
D | up16x9-minmax-fma3.c | 87 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() local 97 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 106 vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 115 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 124 vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 133 vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 142 vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 151 vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 160 vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() 169 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3() [all …]
|
D | up16x9-minmax-avx.c | 87 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() local 97 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 106 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 115 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 124 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 133 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 142 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 151 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 160 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() 169 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x9__avx() [all …]
|
D | up8x9-minmax-avx.c | 87 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() local 94 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 100 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 106 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 112 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 118 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 124 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 130 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 136 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() 142 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x9__avx() [all …]
|
D | up8x9-minmax-fma3.c | 87 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() local 94 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 100 vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 106 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 112 vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 118 vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 124 vacc01234567p0 = _mm256_fmadd_ps(vi5x01234567, vk5x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 130 vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 136 vacc01234567p0 = _mm256_fmadd_ps(vi7x01234567, vk7x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() 142 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3() [all …]
|
D | up16x4-minmax-fma3.c | 62 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() local 72 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() 81 vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() 90 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() 99 vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() 105 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() 115 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() local 121 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() 127 vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() 133 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x4__fma3() [all …]
|
D | up8x25-minmax-fma3-acc2.c | 167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() local 174 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 186 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 198 vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 210 vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 222 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 234 vacc01234567p0 = _mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 246 vacc01234567p0 = _mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 258 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() 270 vacc01234567p0 = _mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x25__fma3_acc2() [all …]
|
D | up16x4-minmax-avx.c | 62 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() local 72 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() 81 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() 90 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() 99 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() 105 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() 115 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() local 121 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() 127 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() 133 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x4__avx() [all …]
|
D | up16x9-minmax-fma3-acc2.c | 87 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() local 97 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 115 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 133 vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 151 vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 169 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 175 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, vacc01234567p1); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 178 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() 188 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() local 194 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x9__fma3_acc2() [all …]
|
D | up16x25-minmax-fma3-acc2.c | 167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() local 177 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 195 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 213 vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 231 vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 249 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 267 vacc01234567p0 = _mm256_fmadd_ps(vi10x01234567, vk10x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 285 vacc01234567p0 = _mm256_fmadd_ps(vi12x01234567, vk12x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 303 vacc01234567p0 = _mm256_fmadd_ps(vi14x01234567, vk14x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() 321 vacc01234567p0 = _mm256_fmadd_ps(vi16x01234567, vk16x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up16x25__fma3_acc2() [all …]
|
D | up8x25-minmax-avx-acc2.c | 167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() local 174 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 186 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 198 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 210 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 222 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 234 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi10x01234567, vk10x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 246 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi12x01234567, vk12x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 258 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() 270 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi16x01234567, vk16x01234567)); in xnn_f32_dwconv_minmax_ukernel_up8x25__avx_acc2() [all …]
|
D | up16x25-minmax-avx-acc2.c | 167 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() local 177 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 195 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 213 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 231 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 249 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 267 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi10x01234567, vk10x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 285 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi12x01234567, vk12x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 303 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() 321 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi16x01234567, vk16x01234567)); in xnn_f32_dwconv_minmax_ukernel_up16x25__avx_acc2() [all …]
|
D | up8x9-minmax-fma3-acc2.c | 87 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() local 94 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() 106 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() 118 vacc01234567p0 = _mm256_fmadd_ps(vi4x01234567, vk4x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() 130 vacc01234567p0 = _mm256_fmadd_ps(vi6x01234567, vk6x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() 142 vacc01234567p0 = _mm256_fmadd_ps(vi8x01234567, vk8x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() 147 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, vacc01234567p1); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() 149 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() 160 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() local 164 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x9__fma3_acc2() [all …]
|
D | up8x4-minmax-fma3.c | 62 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() local 69 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() 75 vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() 81 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() 87 vacc01234567p0 = _mm256_fmadd_ps(vi3x01234567, vk3x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() 92 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() 103 __m256 vacc01234567p0 = _mm256_load_ps(w); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() local 107 vacc01234567p0 = _mm256_fmadd_ps(vi0x01234567, vk0x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() 111 vacc01234567p0 = _mm256_fmadd_ps(vi1x01234567, vk1x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() 115 vacc01234567p0 = _mm256_fmadd_ps(vi2x01234567, vk2x01234567, vacc01234567p0); in xnn_f32_dwconv_minmax_ukernel_up8x4__fma3() [all …]
|
/external/XNNPACK/src/f16-dwconv/gen/ |
D | up8x25-minmax-neonfp16arith.c | 167 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() local 172 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 176 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 180 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 184 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 188 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 192 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 196 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 200 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() 204 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith() [all …]
|
D | up16x25-minmax-neonfp16arith.c | 167 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() local 175 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 182 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 189 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 196 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 203 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 210 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 217 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 224 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() 231 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith() [all …]
|
D | up16x9-minmax-neonfp16arith.c | 87 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() local 95 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 102 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 109 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 116 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 123 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 130 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 137 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 144 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() 151 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith() [all …]
|
D | up8x9-minmax-neonfp16arith.c | 87 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() local 92 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 96 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 100 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 104 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 108 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 112 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi5x01234567, vk5x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 116 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 120 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi7x01234567, vk7x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() 124 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith() [all …]
|
D | up16x4-minmax-neonfp16arith.c | 62 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() local 70 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() 77 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() 84 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() 91 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi3x01234567, vk3x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() 95 float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin); in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() 104 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() local 109 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() 113 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi1x01234567, vk1x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() 117 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith() [all …]
|
D | up8x25-minmax-neonfp16arith-acc2.c | 167 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() local 172 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 180 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 188 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 196 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 204 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 212 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 220 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 228 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() 236 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567); in xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2() [all …]
|
D | up16x25-minmax-neonfp16arith-acc2.c | 167 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() local 175 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 189 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 203 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 217 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 231 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 245 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi10x01234567, vk10x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 259 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi12x01234567, vk12x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 273 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi14x01234567, vk14x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() 287 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi16x01234567, vk16x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2() [all …]
|
D | up16x9-minmax-neonfp16arith-acc2.c | 87 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() local 95 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 109 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi2x01234567, vk2x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 123 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi4x01234567, vk4x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 137 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi6x01234567, vk6x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 151 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi8x01234567, vk8x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 155 vacc01234567p0 = vaddq_f16(vacc01234567p0, vacc01234567p1); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 158 float16x8_t vacc01234567 = vmaxq_f16(vacc01234567p0, vmin); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() 167 float16x8_t vacc01234567p0 = vld1q_f16(w); w += 8; in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() local 172 vacc01234567p0 = vfmaq_f16(vacc01234567p0, vi0x01234567, vk0x01234567); in xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2() [all …]
|