1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <arm_neon.h>
9 
10 #include <xnnpack/zip.h>
11 
12 
xnn_x32_zip_x4_ukernel__neon(size_t n,const uint32_t * input,uint32_t * output)13 void xnn_x32_zip_x4_ukernel__neon(
14     size_t n,
15     const uint32_t* input,
16     uint32_t* output)
17 {
18   assert(n != 0);
19   assert(n % 4 == 0);
20 
21   const uint32_t* x = input;
22   const uint32_t* y = (const uint32_t*) ((uintptr_t) x + n);
23   const uint32_t* z = (const uint32_t*) ((uintptr_t) y + n);
24   const uint32_t* w = (const uint32_t*) ((uintptr_t) z + n);
25   uint32_t* o = output;
26 
27   while (n >= 16) {
28     uint32x4x4_t vxyzw;
29     vxyzw.val[0] = vld1q_u32(x); x += 4;
30     vxyzw.val[1] = vld1q_u32(y); y += 4;
31     vxyzw.val[2] = vld1q_u32(z); z += 4;
32     vxyzw.val[3] = vld1q_u32(w); w += 4;
33     vst4q_u32(o, vxyzw); o += 16;
34     n -= 16;
35   }
36   if XNN_UNLIKELY(n != 0) {
37     if (n & 8) {
38       uint32x2x4_t vxyzw;
39       vxyzw.val[0] = vld1_u32(x); x += 2;
40       vxyzw.val[1] = vld1_u32(y); y += 2;
41       vxyzw.val[2] = vld1_u32(z); z += 2;
42       vxyzw.val[3] = vld1_u32(w); w += 2;
43       vst4_u32(o, vxyzw); o += 8;
44     }
45     if (n & 4) {
46       uint32x4_t vxyzw = vld1q_dup_u32(x);
47       vxyzw = vld1q_lane_u32(y, vxyzw, 1);
48       vxyzw = vld1q_lane_u32(z, vxyzw, 2);
49       vxyzw = vld1q_lane_u32(w, vxyzw, 3);
50       vst1q_u32(o, vxyzw);
51     }
52   }
53 }
54