/* Minimal stand-ins for the <stdint.h> fixed-width types used below.
 * NOTE(review): defining uintptr_t as 'unsigned int' assumes 32-bit
 * pointers — valid for the 32-bit ARM/NEON target this file is written
 * for, but not portable; confirm before reusing elsewhere. */
typedef short int16_t;
typedef int int32_t;
typedef unsigned char uint8_t;
typedef unsigned int uintptr_t;
5 
/* NEON vector types declared directly on GCC's internal element types
 * (__builtin_neon_qi/hi/si/uqi/uhi = 8/16/32-bit signed / unsigned lanes)
 * instead of including <arm_neon.h>.  __vector_size__(8) types live in
 * 64-bit D registers; __vector_size__(16) types in 128-bit Q registers. */
typedef __builtin_neon_hi int16x4_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_uqi uint8x8_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_uhi uint16x8_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_si int32x4_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_hi int16x8_t __attribute__ ((__vector_size__ (16)));
typedef __builtin_neon_qi int8x8_t __attribute__ ((__vector_size__ (8)));
typedef __builtin_neon_si int32x2_t __attribute__ ((__vector_size__ (8)));
13 
/* Pair of 8x8-bit vectors: the operand type of the 2-way interleaved
 * store wrapper vst2_u8 below. */
typedef struct uint8x8x2_t
{
  uint8x8_t val[2];
} uint8x8x2_t;
/* Quad of 8x8-bit vectors: the operand type of the 4-way interleaved
 * store wrapper vst4_u8 below (one vector per output channel). */
typedef struct uint8x8x4_t
{
  uint8x8_t val[4];
} uint8x8x4_t;
22 
/* vaddq_u16: lane-wise add of two 8x16-bit unsigned vectors (NEON VADD.I16).
 * The trailing integer constant of each builtin call in this file selects
 * the builtin's signedness/rounding variant — presumably matching GCC's
 * arm_neon.h encoding; the casts exist because the builtins are typed on
 * the signed element types. */
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
}
/* vaddl_s16: widening add, int16x4 + int16x4 -> int32x4 (NEON VADDL.S16). */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vaddl_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1);
}
/* vaddl_u8: widening add, uint8x8 + uint8x8 -> uint16x8 (NEON VADDL.U8). */
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
}
/* vaddw_u8: widen __b's u8 lanes to u16 and add them to __a (NEON VADDW.U8). */
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vaddwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
}
/* vrhadd_u8: per-lane rounding halving add, (a + b + 1) >> 1, without
 * intermediate overflow (NEON VRHADD.U8).  The constant 4 selects the
 * unsigned rounding variant of the vhadd builtin. */
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
}
/* vsubl_s16: widening subtract, int16x4 - int16x4 -> int32x4 (NEON VSUBL.S16). */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vsubl_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1);
}
/* vsubl_u8: widening subtract, uint8x8 - uint8x8 -> uint16x8 (NEON VSUBL.U8). */
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
{
  return (uint16x8_t)__builtin_neon_vsublv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
}
/* vshrn_n_u16: shift each 16-bit lane right by __b and narrow to 8 bits
 * (NEON VSHRN.I16).  __b must be a compile-time constant in 1..8. */
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vshrn_n_u16 (uint16x8_t __a, const int __b)
{
  return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 0);
}
/* vrshrn_n_s32: rounding shift right by __b and narrow s32 -> s16
 * (NEON VRSHRN.I32); the constant 5 selects the signed rounding variant. */
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vrshrn_n_s32 (int32x4_t __a, const int __b)
{
  return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 5);
}
/* vshlq_n_s16: shift each signed 16-bit lane left by immediate __b
 * (NEON VSHL.I16). */
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vshlq_n_s16 (int16x8_t __a, const int __b)
{
  return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
}
/* vshll_n_s16: shift left by __b and widen s16x4 -> s32x4 (NEON VSHLL.S16). */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vshll_n_s16 (int16x4_t __a, const int __b)
{
  return (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1);
}
/* vshll_n_u8: shift left by __b and widen u8x8 -> u16x8 (NEON VSHLL.U8). */
__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
vshll_n_u8 (uint8x8_t __a, const int __b)
{
  return (uint16x8_t)__builtin_neon_vshll_nv8qi ((int8x8_t) __a, __b, 0);
}
/* vmov_n_s32: broadcast the scalar __a into both 32-bit lanes (NEON VDUP.32). */
__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
vmov_n_s32 (int32_t __a)
{
  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
}
/* vmov_n_u8: broadcast the byte __a into all 8 lanes (NEON VDUP.8). */
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vmov_n_u8 (uint8_t __a)
{
  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
}
/* vcombine_s16: concatenate two s16x4 halves into one s16x8
 * (__a becomes the low half, __b the high half). */
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vcombine_s16 (int16x4_t __a, int16x4_t __b)
{
  return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
}
/* vget_high_s16: extract the upper 4 lanes of an s16x8 vector. */
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vget_high_s16 (int16x8_t __a)
{
  return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
}
/* vget_low_s16: extract the lower 4 lanes of an s16x8 vector. */
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vget_low_s16 (int16x8_t __a)
{
  return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
}
/* vqmovun_s16: saturate each signed 16-bit lane to [0, 255] and narrow to
 * unsigned 8 bits (NEON VQMOVUN.S16) — the final clamp of the RGB math. */
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vqmovun_s16 (int16x8_t __a)
{
  return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1);
}
/* vmovl_s16: sign-extend s16x4 -> s32x4 (NEON VMOVL.S16). */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmovl_s16 (int16x4_t __a)
{
  return (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1);
}
/* vmulq_lane_s32: multiply each 32-bit lane of __a by lane __c of __b
 * (NEON VMUL by scalar). */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
{
  return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1);
}
/* vmlal_lane_s16: __a + (__b widened-multiplied by lane __d of __c)
 * (NEON VMLAL.S16 by scalar); result lanes are 32-bit. */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
{
  return (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1);
}
/* vqdmlal_lane_s16: saturating doubling widening multiply-accumulate:
 * __a + 2 * __b * __c[__d], with 32-bit saturation (NEON VQDMLAL.S16). */
__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
{
  return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1);
}
/* vld1_s16: load four 16-bit values from memory into a 64-bit vector
 * (NEON VLD1.16); no alignment requirement beyond the element size. */
__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
vld1_s16 (const int16_t * __a)
{
  return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
}
/* vld1_u8: load eight bytes from memory into a 64-bit vector (NEON VLD1.8). */
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vld1_u8 (const uint8_t * __a)
{
  return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
}
/* vst2_u8: 2-way interleaved store (NEON VST2.8): writes 16 bytes
 * val[0][0], val[1][0], val[0][1], val[1][1], ... to __a.  The union
 * re-packs the struct into the opaque TImode value the builtin expects. */
__extension__ static __inline void __attribute__ ((__always_inline__))
vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
{
  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
}
/* vst4_u8: 4-way interleaved store (NEON VST4.8): writes 32 bytes
 * val[0][0], val[1][0], val[2][0], val[3][0], val[0][1], ... to __a —
 * used below to emit packed R,G,B,A pixels from per-channel vectors. */
__extension__ static __inline void __attribute__ ((__always_inline__))
vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
{
  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
}
/* vreinterpretq_s16_u16: reinterpret the bit pattern of a u16x8 vector as
 * s16x8 — a pure type cast, no instruction is emitted. */
__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
vreinterpretq_s16_u16 (uint16x8_t __a)
{
  return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
}
160 
/* Fixed-point (x 2^16) YCbCr->RGB mixing coefficients, pre-scaled to fit
 * int16_t: { 89858/4, 22014, 45773/2, 113618/4 }.  89858~1.371*2^16 (V->R),
 * 22014~0.336*2^16 (U->G), 45773~0.698*2^16 (V->G), 113618~1.734*2^16
 * (U->B) — presumably BT.601 studio-swing constants; the /4 and /2
 * pre-scales are undone at the use sites by the doubling (vqdmlal) and
 * extra left shifts.  TODO(review): confirm against the reference scalar
 * converter. */
static const int16_t coef[4] = { 89858 / 4, 22014, 45773 / 2, 113618 / 4 };
162 
UpsampleRgbaLinePairNEON(const uint8_t * top_y,const uint8_t * bottom_y,const uint8_t * top_u,const uint8_t * top_v,const uint8_t * cur_u,const uint8_t * cur_v,uint8_t * top_dst,uint8_t * bottom_dst,int len)163 void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y, const uint8_t *top_u, const uint8_t *top_v, const uint8_t *cur_u, const uint8_t *cur_v, uint8_t *top_dst, uint8_t *bottom_dst, int len)
164 {
165     int block;
166     uint8_t uv_buf[2 * 32 + 15];
167     uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);
168     const int uv_len = (len + 1) >> 1;
169     const int num_blocks = (uv_len - 1) >> 3;
170     const int leftover = uv_len - num_blocks * 8;
171     const int last_pos = 1 + 16 * num_blocks;
172     const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;
173     const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;
174     const int16x4_t cf16 = vld1_s16(coef);
175     const int32x2_t cf32 = vmov_n_s32(76283);
176     const uint8x8_t u16 = vmov_n_u8(16);
177     const uint8x8_t u128 = vmov_n_u8(128);
178     for (block = 0; block < num_blocks; ++block) {
179         {
180             uint8x8_t a = vld1_u8(top_u);
181             uint8x8_t b = vld1_u8(top_u + 1);
182             uint8x8_t c = vld1_u8(cur_u);
183             uint8x8_t d = vld1_u8(cur_u + 1);
184             uint16x8_t al = vshll_n_u8(a, 1);
185             uint16x8_t bl = vshll_n_u8(b, 1);
186             uint16x8_t cl = vshll_n_u8(c, 1);
187             uint16x8_t dl = vshll_n_u8(d, 1);
188             uint8x8_t diag1, diag2;
189             uint16x8_t sl;
190             sl = vaddl_u8(a, b);
191             sl = vaddw_u8(sl, c);
192             sl = vaddw_u8(sl, d);
193             al = vaddq_u16(sl, al);
194             bl = vaddq_u16(sl, bl);
195             al = vaddq_u16(al, dl);
196             bl = vaddq_u16(bl, cl);
197             diag2 = vshrn_n_u16(al, 3);
198             diag1 = vshrn_n_u16(bl, 3);
199             a = vrhadd_u8(a, diag1);
200             b = vrhadd_u8(b, diag2);
201             c = vrhadd_u8(c, diag2);
202             d = vrhadd_u8(d, diag1);
203             {
204                 const uint8x8x2_t a_b = {{ a, b }};
205                 const uint8x8x2_t c_d = {{ c, d }};
206                 vst2_u8(r_uv, a_b);
207                 vst2_u8(r_uv + 32, c_d);
208             }
209         }
210         {
211             uint8x8_t a = vld1_u8(top_v);
212             uint8x8_t b = vld1_u8(top_v + 1);
213             uint8x8_t c = vld1_u8(cur_v);
214             uint8x8_t d = vld1_u8(cur_v + 1);
215             uint16x8_t al = vshll_n_u8(a, 1);
216             uint16x8_t bl = vshll_n_u8(b, 1);
217             uint16x8_t cl = vshll_n_u8(c, 1);
218             uint16x8_t dl = vshll_n_u8(d, 1);
219             uint8x8_t diag1, diag2;
220             uint16x8_t sl;
221             sl = vaddl_u8(a, b);
222             sl = vaddw_u8(sl, c);
223             sl = vaddw_u8(sl, d);
224             al = vaddq_u16(sl, al);
225             bl = vaddq_u16(sl, bl);
226             al = vaddq_u16(al, dl);
227             bl = vaddq_u16(bl, cl);
228             diag2 = vshrn_n_u16(al, 3);
229             diag1 = vshrn_n_u16(bl, 3);
230             a = vrhadd_u8(a, diag1);
231             b = vrhadd_u8(b, diag2);
232             c = vrhadd_u8(c, diag2);
233             d = vrhadd_u8(d, diag1);
234             {
235                 const uint8x8x2_t a_b = {{ a, b }};
236                 const uint8x8x2_t c_d = {{ c, d }};
237                 vst2_u8(r_uv + 16, a_b);
238                 vst2_u8(r_uv + 16 + 32, c_d);
239             }
240         }
241         {
242             if (top_y) {
243                 {
244                     int i;
245                     for (i = 0; i < 16; i += 8) {
246                         int off = ((16 * block + 1) + i) * 4;
247                         uint8x8_t y = vld1_u8(top_y + (16 * block + 1) + i);
248                         uint8x8_t u = vld1_u8((r_uv) + i);
249                         uint8x8_t v = vld1_u8((r_uv) + i + 16);
250                         int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
251                         int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
252                         int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
253                         int16x8_t ud = vshlq_n_s16(uu, 1);
254                         int16x8_t vd = vshlq_n_s16(vv, 1);
255                         int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
256                         int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
257                         int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
258                         int32x4_t vl = vmovl_s16(vget_low_s16(vv));
259                         int32x4_t vh = vmovl_s16(vget_high_s16(vv));
260                         int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
261                         int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
262                         int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
263                         int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
264                         int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
265                         int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
266                         int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
267                         int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
268                         int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
269                         int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
270                         int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
271                         int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
272                         int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
273                         int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
274                         rl = vmulq_lane_s32(rl, cf32, 0);
275                         rh = vmulq_lane_s32(rh, cf32, 0);
276                         gl = vmulq_lane_s32(gl, cf32, 0);
277                         gh = vmulq_lane_s32(gh, cf32, 0);
278                         bl = vmulq_lane_s32(bl, cf32, 0);
279                         bh = vmulq_lane_s32(bh, cf32, 0);
280                         y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
281                         u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
282                         v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
283                         do {
284                             const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
285                             vst4_u8(top_dst + off, r_g_b_v255);
286                         } while (0);
287                     }
288                 }
289             }
290             if (bottom_y) {
291                 {
292                     int i;
293                     for (i = 0; i < 16; i += 8) {
294                         int off = ((16 * block + 1) + i) * 4;
295                         uint8x8_t y = vld1_u8(bottom_y + (16 * block + 1) + i);
296                         uint8x8_t u = vld1_u8(((r_uv) + 32) + i);
297                         uint8x8_t v = vld1_u8(((r_uv) + 32) + i + 16);
298                         int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
299                         int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
300                         int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
301                         int16x8_t ud = vshlq_n_s16(uu, 1);
302                         int16x8_t vd = vshlq_n_s16(vv, 1);
303                         int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1), vget_low_s16(vd), cf16, 0);
304                         int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), vget_high_s16(vd), cf16, 0);
305                         int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
306                         int32x4_t vl = vmovl_s16(vget_low_s16(vv));
307                         int32x4_t vh = vmovl_s16(vget_high_s16(vv));
308                         int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
309                         int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
310                         int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
311                         int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
312                         int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
313                         int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1), vget_low_s16(ud), cf16, 3);
314                         int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), vget_high_s16(ud), cf16, 3);
315                         int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
316                         int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
317                         int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
318                         int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
319                         int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
320                         int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
321                         int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
322                         rl = vmulq_lane_s32(rl, cf32, 0);
323                         rh = vmulq_lane_s32(rh, cf32, 0);
324                         gl = vmulq_lane_s32(gl, cf32, 0);
325                         gh = vmulq_lane_s32(gh, cf32, 0);
326                         bl = vmulq_lane_s32(bl, cf32, 0);
327                         bh = vmulq_lane_s32(bh, cf32, 0);
328                         y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
329                         u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
330                         v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
331                         do {
332                             const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
333                             vst4_u8(bottom_dst + off, r_g_b_v255);
334                         } while (0);
335                     }
336                 }
337             }
338         }
339     }
340 }
341