/external/XNNPACK/src/f32-gemm/gen/ |
D | 6x8s4-minmax-neon.c | 93 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() local 104 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() 110 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() 117 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() 127 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() 133 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() 140 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() 150 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() 156 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() 163 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() [all …]
|
D | 6x8s4-minmax-neonfma.c | 93 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() local 104 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() 110 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() 117 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() 127 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() 133 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() 140 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() 150 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() 156 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() 163 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() [all …]
|
D | 6x8s4-minmax-wasmsimd-arm.c | 100 v128_t va5 = wasm_v128_load(a5); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() local 112 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() 118 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() 125 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() 135 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() 141 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() 148 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() 158 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() 164 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() 171 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() [all …]
|
D | 6x8s4-minmax-wasmsimd-x86.c | 98 v128_t va5 = wasm_v128_load(a5); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() local 110 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 116 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 123 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 133 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 139 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 146 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 156 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 162 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() 169 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() [all …]
|
D | 8x8s4-minmax-neon.c | 109 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() local 122 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 130 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 139 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 151 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 159 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 168 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 180 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 188 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() 197 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() [all …]
|
D | 8x8s4-minmax-neonfma.c | 109 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() local 122 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 130 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 139 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 151 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 159 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 168 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 180 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 188 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() 197 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() [all …]
|
D | 6x8-minmax-neonfma-lane-ld128.c | 93 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() local 104 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() 110 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() 120 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() 126 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() 136 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() 142 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() 152 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() 158 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() 167 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() local [all …]
|
D | 6x8-minmax-neon-lane-ld128.c | 93 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() local 104 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() 110 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() 120 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() 126 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() 136 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() 142 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() 152 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() 158 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() 167 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() local [all …]
|
/external/XNNPACK/src/f32-gemm/gen-inc/ |
D | 6x8s4inc-minmax-neon.c | 95 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() local 106 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() 112 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() 119 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() 129 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() 135 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() 142 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() 152 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() 158 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() 165 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() [all …]
|
D | 6x8s4inc-minmax-neonfma.c | 95 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() local 106 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() 112 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() 119 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() 129 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() 135 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() 142 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() 152 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() 158 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() 165 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() [all …]
|
D | 6x8s4inc-minmax-wasmsimd-arm.c | 102 v128_t va5 = wasm_v128_load(a5); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() local 114 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() 120 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() 127 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() 137 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() 143 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() 150 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() 160 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() 166 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() 173 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() [all …]
|
D | 6x8s4inc-minmax-wasmsimd-x86.c | 100 v128_t va5 = wasm_v128_load(a5); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() local 112 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 118 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 125 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 135 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 141 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 148 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 158 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 164 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() 171 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() [all …]
|
D | 8x8s4inc-minmax-neon.c | 111 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() local 124 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 132 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 141 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 153 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 161 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 170 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 182 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 190 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() 199 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() [all …]
|
D | 8x8s4inc-minmax-neonfma.c | 111 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() local 124 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 132 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 141 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 153 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 161 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 170 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 182 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 190 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() 199 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() [all …]
|
D | 6x8inc-minmax-neonfma-lane-ld128.c | 95 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() local 106 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() 112 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() 122 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() 128 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() 138 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() 144 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() 154 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() 160 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() 169 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() local [all …]
|
D | 6x8inc-minmax-neon-lane-ld128.c | 95 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() local 106 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() 112 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() 122 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() 128 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() 138 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() 144 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() 154 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() 160 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() 169 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() local [all …]
|
/external/XNNPACK/src/f32-igemm/gen/ |
D | 6x8s4-minmax-wasmsimd-arm.c | 128 v128_t va5 = wasm_v128_load(a5); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() local 140 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() 146 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() 153 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() 163 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() 169 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() 176 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() 186 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() 192 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() 199 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() [all …]
|
D | 6x8s4-minmax-neonfma.c | 120 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() local 131 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() 137 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() 144 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() 154 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() 160 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() 167 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() 177 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() 183 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() 190 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() [all …]
|
D | 6x8s4-minmax-neon.c | 120 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() local 131 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() 137 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() 144 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() 154 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() 160 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() 167 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() 177 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() 183 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() 190 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() [all …]
|
D | 6x8s4-minmax-wasmsimd-x86.c | 126 v128_t va5 = wasm_v128_load(a5); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() local 138 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 144 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 151 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 161 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 167 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 174 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 184 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 190 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() 197 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() [all …]
|
D | 8x8s4-minmax-neon.c | 142 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() local 155 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 163 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 172 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 184 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 192 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 201 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 213 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 221 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() 230 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() [all …]
|
D | 8x8s4-minmax-neonfma.c | 142 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() local 155 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 163 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 172 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 184 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 192 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 201 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 213 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 221 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() 230 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() [all …]
|
D | 6x8-minmax-neon-lane-ld128.c | 121 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() local 132 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() 138 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() 148 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() 154 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() 164 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() 170 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() 180 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() 186 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() 195 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() local [all …]
|
D | 6x8-minmax-neonfma-lane-ld128.c | 121 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() local 132 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() 138 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() 148 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() 154 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() 164 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() 170 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() 180 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() 186 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() 195 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() local [all …]
|
/external/XNNPACK/src/f16-gemm/gen/ |
D | 6x16-minmax-neonfp16arith-ld64.c | 95 const float16x4_t va5 = vld1_f16(a5); a5 += 4; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local 106 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 112 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 119 const float16x8_t va5c0 = vdupq_lane_f16(va5, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 143 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 149 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc1, va5, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 156 const float16x8_t va5c1 = vdupq_lane_f16(va5, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 180 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 186 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc2, va5, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() 193 const float16x8_t va5c2 = vdupq_lane_f16(va5, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() [all …]
|