Home
last modified time | relevance | path

Searched refs:va5 (Results 1 – 25 of 148) sorted by relevance

Pages: 1 2 3 4 5 6

/external/XNNPACK/src/f32-gemm/gen/
D6x8s4-minmax-neon.c93 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_6x8s4__neon() local
104 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
110 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
117 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
127 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
133 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
140 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
150 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
156 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
163 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neon()
[all …]
D6x8s4-minmax-neonfma.c93 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma() local
104 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
110 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
117 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
127 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
133 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
140 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
150 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
156 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
163 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma()
[all …]
D6x8s4-minmax-wasmsimd-arm.c100 v128_t va5 = wasm_v128_load(a5); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm() local
112 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
118 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
125 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
135 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
141 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
148 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
158 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
164 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
171 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_arm()
[all …]
D6x8s4-minmax-wasmsimd-x86.c98 v128_t va5 = wasm_v128_load(a5); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86() local
110 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
116 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
123 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
133 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
139 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
146 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
156 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
162 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
169 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemm_minmax_ukernel_6x8s4__wasmsimd_x86()
[all …]
D8x8s4-minmax-neon.c109 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_8x8s4__neon() local
122 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
130 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
139 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
151 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
159 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
168 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
180 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
188 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
197 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neon()
[all …]
D8x8s4-minmax-neonfma.c109 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma() local
122 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
130 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
139 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
151 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
159 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
168 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
180 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
188 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
197 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemm_minmax_ukernel_8x8s4__neonfma()
[all …]
D6x8-minmax-neonfma-lane-ld128.c93 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() local
104 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128()
110 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128()
120 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128()
126 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128()
136 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128()
142 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128()
152 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128()
158 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128()
167 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld128() local
[all …]
D6x8-minmax-neon-lane-ld128.c93 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() local
104 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128()
110 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128()
120 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128()
126 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128()
136 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128()
142 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128()
152 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128()
158 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128()
167 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemm_minmax_ukernel_6x8__neon_lane_ld128() local
[all …]
/external/XNNPACK/src/f32-gemm/gen-inc/
D6x8s4inc-minmax-neon.c95 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon() local
106 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
112 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
119 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
129 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
135 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
142 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
152 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
158 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
165 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neon()
[all …]
D6x8s4inc-minmax-neonfma.c95 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma() local
106 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
112 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
119 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
129 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
135 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
142 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
152 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
158 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
165 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_6x8s4__neonfma()
[all …]
D6x8s4inc-minmax-wasmsimd-arm.c102 v128_t va5 = wasm_v128_load(a5); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm() local
114 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
120 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
127 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
137 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
143 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
150 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
160 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
166 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
173 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_arm()
[all …]
D6x8s4inc-minmax-wasmsimd-x86.c100 v128_t va5 = wasm_v128_load(a5); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86() local
112 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
118 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
125 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
135 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
141 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
148 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
158 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
164 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
171 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_gemminc_minmax_ukernel_6x8s4__wasmsimd_x86()
[all …]
D8x8s4inc-minmax-neon.c111 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon() local
124 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
132 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
141 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
153 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
161 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
170 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
182 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
190 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
199 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neon()
[all …]
D8x8s4inc-minmax-neonfma.c111 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma() local
124 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
132 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
141 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
153 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
161 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
170 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
182 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
190 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
199 va5 = vextq_f32(va5, va5, 1); in xnn_f32_gemminc_minmax_ukernel_8x8s4__neonfma()
[all …]
D6x8inc-minmax-neonfma-lane-ld128.c95 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() local
106 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128()
112 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128()
122 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128()
128 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128()
138 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128()
144 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128()
154 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128()
160 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128()
169 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemminc_minmax_ukernel_6x8__neonfma_lane_ld128() local
[all …]
D6x8inc-minmax-neon-lane-ld128.c95 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() local
106 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128()
112 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128()
122 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128()
128 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128()
138 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128()
144 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128()
154 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128()
160 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128()
169 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_gemminc_minmax_ukernel_6x8__neon_lane_ld128() local
[all …]
/external/XNNPACK/src/f32-igemm/gen/
D6x8s4-minmax-wasmsimd-arm.c128 v128_t va5 = wasm_v128_load(a5); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm() local
140 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
146 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
153 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
163 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
169 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
176 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
186 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
192 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
199 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_arm()
[all …]
D6x8s4-minmax-neonfma.c120 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma() local
131 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
137 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
144 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
154 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
160 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
167 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
177 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
183 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
190 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma()
[all …]
D6x8s4-minmax-neon.c120 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_6x8s4__neon() local
131 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
137 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
144 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
154 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
160 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
167 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
177 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
183 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
190 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_6x8s4__neon()
[all …]
D6x8s4-minmax-wasmsimd-x86.c126 v128_t va5 = wasm_v128_load(a5); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86() local
138 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
144 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c0)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
151 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
161 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
167 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c1)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
174 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
184 vacc5x0123 = wasm_f32x4_add(vacc5x0123, wasm_f32x4_mul(va5, vb0123c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
190 vacc5x4567 = wasm_f32x4_add(vacc5x4567, wasm_f32x4_mul(va5, vb4567c2)); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
197 va5 = wasm_v32x4_shuffle(va5, va5, 1, 2, 3, 0); in xnn_f32_igemm_minmax_ukernel_6x8s4__wasmsimd_x86()
[all …]
D8x8s4-minmax-neon.c142 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_8x8s4__neon() local
155 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
163 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
172 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
184 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
192 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
201 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
213 vacc5x0123 = vmlaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
221 vacc5x4567 = vmlaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
230 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neon()
[all …]
D8x8s4-minmax-neonfma.c142 float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma() local
155 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
163 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c0); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
172 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
184 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
192 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
201 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
213 vacc5x0123 = vfmaq_f32(vacc5x0123, va5, vb0123c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
221 vacc5x4567 = vfmaq_f32(vacc5x4567, va5, vb4567c2); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
230 va5 = vextq_f32(va5, va5, 1); in xnn_f32_igemm_minmax_ukernel_8x8s4__neonfma()
[all …]
D6x8-minmax-neon-lane-ld128.c121 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() local
132 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128()
138 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128()
148 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128()
154 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128()
164 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128()
170 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128()
180 vacc5x0123 = vmlaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128()
186 vacc5x4567 = vmlaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128()
195 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_igemm_minmax_ukernel_6x8__neon_lane_ld128() local
[all …]
D6x8-minmax-neonfma-lane-ld128.c121 const float32x4_t va5 = vld1q_f32(a5); a5 += 4; in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() local
132 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c0, vget_low_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128()
138 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c0, vget_low_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128()
148 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c1, vget_low_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128()
154 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c1, vget_low_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128()
164 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c2, vget_high_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128()
170 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c2, vget_high_f32(va5), 0); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128()
180 vacc5x0123 = vfmaq_lane_f32(vacc5x0123, vb0123c3, vget_high_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128()
186 vacc5x4567 = vfmaq_lane_f32(vacc5x4567, vb4567c3, vget_high_f32(va5), 1); in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128()
195 const float32x4_t va5 = vld1q_dup_f32(a5); a5 += 1; in xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld128() local
[all …]
/external/XNNPACK/src/f16-gemm/gen/
D6x16-minmax-neonfp16arith-ld64.c95 const float16x4_t va5 = vld1_f16(a5); a5 += 4; in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64() local
106 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c0, va5, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
112 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc0, va5, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
119 const float16x8_t va5c0 = vdupq_lane_f16(va5, 0); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
143 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c1, va5, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
149 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc1, va5, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
156 const float16x8_t va5c1 = vdupq_lane_f16(va5, 1); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
180 vacc5x01234567 = vfmaq_lane_f16(vacc5x01234567, vb01234567c2, va5, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
186 vacc5x89ABCDEF = vfmaq_lane_f16(vacc5x89ABCDEF, vb89ABCDEFc2, va5, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
193 const float16x8_t va5c2 = vdupq_lane_f16(va5, 2); in xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64()
[all …]

Pages: 1 2 3 4 5 6