/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
    !defined(__aarch64__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1 throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.8 {q0, q1}, [%0]! \n"
    "subs %2, %2, #16 \n"  // 16 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"       // Clobber List
  );
}
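
// For reference, a scalar sketch of what the loop above computes; the name
// ScaleRowDown2_Sketch is hypothetical and the block is kept out of the
// build with #if 0.
#if 0
static void ScaleRowDown2_Sketch(const uint8* src_ptr, uint8* dst,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 2 + 1];  // keep the odd pixel of each source pair.
  }
}
#endif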

// Read 32x1 average down and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0, q1}, [%0]! \n"  // load pixels and post inc
    "subs %2, %2, #16 \n"  // 16 processed per loop
    "vpaddl.u8 q0, q0 \n"  // add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vrshrn.u16 d0, q0, #1 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1 \n"
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"       // Clobber List
  );
}
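
// A scalar sketch of the averaging above, assuming the same rounding as
// vrshrn (add 1 before the shift); hypothetical helper, not compiled.
#if 0
static void ScaleRowDown2Linear_Sketch(const uint8* src_ptr, uint8* dst,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((src_ptr[x * 2] + src_ptr[x * 2 + 1] + 1) >> 1);
  }
}
#endif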

// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst,
                           int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0, q1}, [%0]! \n"  // load row 1 and post inc
    MEMACCESS(1)
    "vld1.8 {q2, q3}, [%1]! \n"  // load row 2 and post inc
    "subs %3, %3, #16 \n"  // 16 processed per loop
    "vpaddl.u8 q0, q0 \n"  // row 1 add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vpadal.u8 q0, q2 \n"  // row 2 add adjacent + row1
    "vpadal.u8 q1, q3 \n"
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "q0", "q1", "q2", "q3"  // Clobber List
  );
}
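
// Scalar sketch of the 2x2 box filter above (rounded average of four source
// pixels); the helper name is hypothetical and the block is not compiled.
#if 0
static void ScaleRowDown2Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] =
        (uint8)((s[x * 2] + s[x * 2 + 1] + t[x * 2] + t[x * 2 + 1] + 2) >> 2);
  }
}
#endif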

void ScaleRowDown4_NEON(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride;
  const uint8* src_ptr2 = src_ptr + src_stride * 2;
  const uint8* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {q0}, [%0]! \n"  // load up 16x4
    MEMACCESS(3)
    "vld1.8 {q1}, [%3]! \n"
    MEMACCESS(4)
    "vld1.8 {q2}, [%4]! \n"
    MEMACCESS(5)
    "vld1.8 {q3}, [%5]! \n"
    "subs %2, %2, #4 \n"
    "vpaddl.u8 q0, q0 \n"
    "vpadal.u8 q0, q1 \n"
    "vpadal.u8 q0, q2 \n"
    "vpadal.u8 q0, q3 \n"
    "vpaddl.u16 q0, q0 \n"
    "vrshrn.u32 d0, q0, #4 \n"  // divide by 16 w/rounding
    "vmovn.u16 d0, q0 \n"
    MEMACCESS(1)
    "vst1.32 {d0[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_ptr1),   // %3
    "+r"(src_ptr2),   // %4
    "+r"(src_ptr3)    // %5
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
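
// Scalar sketch of the 4x4 box filter above: each output byte is the rounded
// average of a 4x4 block of source pixels. Hypothetical helper, not compiled.
#if 0
static void ScaleRowDown4Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width) {
  int x, i, j;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (j = 0; j < 4; ++j) {
      for (i = 0; i < 4; ++i) {
        sum += src_ptr[j * src_stride + x * 4 + i];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);  // divide by 16 with rounding.
  }
}
#endif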

// Down scale from 4 to 3 pixels. Use the NEON multilane read/write
// to de-interleave every 4th pixel into its own register.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #24 \n"
    "vmov d2, d3 \n"  // order d0, d1, d2
    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8 q8, d4 \n"
    "vmovl.u8 q9, d5 \n"
    "vmovl.u8 q10, d6 \n"
    "vmovl.u8 q11, d7 \n"

    // 3 * line_0 + line_1
    "vmlal.u8 q8, d0, d24 \n"
    "vmlal.u8 q9, d1, d24 \n"
    "vmlal.u8 q10, d2, d24 \n"
    "vmlal.u8 q11, d3, d24 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2 \n"
    "vqrshrn.u16 d1, q9, #2 \n"
    "vqrshrn.u16 d2, q10, #2 \n"
    "vqrshrn.u16 d3, q11, #2 \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8 q8, d1 \n"
    "vmlal.u8 q8, d0, d24 \n"
    "vqrshrn.u16 d0, q8, #2 \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8 q8, d2 \n"
    "vmlal.u8 q8, d3, d24 \n"
    "vqrshrn.u16 d2, q8, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"

    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}
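
// The horizontal 4 -> 3 weighting the comments above describe, written as a
// scalar sketch over an already row-filtered line p[]; illustrative only and
// not compiled.
#if 0
static void ScaleRowDown34_Columns_Sketch(const uint8* p, uint8* dst_ptr,
                                          int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = (uint8)((p[0] * 3 + p[1] + 2) >> 2);
    dst_ptr[x + 1] = (uint8)((p[1] + p[2] + 1) >> 1);
    dst_ptr[x + 2] = (uint8)((p[2] + p[3] * 3 + 2) >> 2);
    p += 4;
  }
}
#endif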

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"
    // average src line 0 with src line 1
    "vrhadd.u8 q0, q0, q2 \n"
    "vrhadd.u8 q1, q1, q3 \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8 q3, d1 \n"
    "vmlal.u8 q3, d0, d24 \n"
    "vqrshrn.u16 d0, q3, #2 \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8 q3, d2 \n"
    "vmlal.u8 q3, d3, d24 \n"
    "vqrshrn.u16 d2, q3, #2 \n"

    MEMACCESS(1)
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0};
static uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12,
                          18, 6, 14, 19, 0, 0, 0, 0};
static vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
                             65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12};
static vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
                             65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18};

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    MEMACCESS(3)
    "vld1.8 {q3}, [%3] \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
    "subs %2, %2, #12 \n"
    "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
    "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
    MEMACCESS(1)
    "vst1.8 {d4}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d5[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(&kShuf38)    // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr,
                                      int dst_width) {
  const uint8* src_ptr1 = src_ptr + src_stride * 2;

  asm volatile (
    MEMACCESS(5)
    "vld1.16 {q13}, [%5] \n"
    MEMACCESS(6)
    "vld1.8 {q14}, [%6] \n"
    MEMACCESS(7)
    "vld1.8 {q15}, [%7] \n"
    "add %3, %0 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    MEMACCESS(4)
    "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align it
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"
    "vtrn.u8 d16, d17 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"
    "vtrn.u8 d18, d19 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"
    "vpaddl.u8 q8, q8 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"
    "vpaddl.u8 d19, d19 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 q0, q8 \n"
    "vadd.u16 d4, d3, d7 \n"
    "vadd.u16 d4, d19 \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //            + s[6 + st * 1] + s[7 + st * 1]
    //            + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13 \n"
    "vmovn.u16 d4, q2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"
    "vmovl.u8 q9, d18 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"
    "vadd.u16 q1, q9 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride),    // %3
    "+r"(src_ptr1)       // %4
  : "r"(&kMult38_Div6),  // %5
    "r"(&kShuf38_2),     // %6
    "r"(&kMult38_Div9)   // %7
  : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  asm volatile (
    MEMACCESS(4)
    "vld1.16 {q13}, [%4] \n"
    MEMACCESS(5)
    "vld1.8 {q14}, [%5] \n"
    "add %3, %0 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    MEMACCESS(0)
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    MEMACCESS(3)
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align it
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 d4, d3, d7 \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    MEMACCESS(1)
    "vst1.8 {d3}, [%1]! \n"
    MEMACCESS(1)
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
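
// The "multiply by 65536 / n" trick used by both ScaleRowDown38 box filters,
// as a scalar sketch: the tables above store 65536 / 12 and 65536 / 18
// because vqrdmulh doubles the product before taking the high 16 bits,
// giving an effective divide by 6 or 9. Hypothetical helper, not compiled.
#if 0
static uint8 DivideBySix_Sketch(uint16 sum_of_six_pixels) {
  // sum * (65536 / 6) >> 16 ~= sum / 6, computed with a multiply and a shift
  // (vqrdmulh also rounds and saturates, which this sketch omits).
  return (uint8)(((uint32)sum_of_six_pixels * 10923) >> 16);
}
#endif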

void ScaleAddRows_NEON(const uint8* src_ptr,
                       ptrdiff_t src_stride,
                       uint16* dst_ptr,
                       int src_width,
                       int src_height) {
  const uint8* src_tmp;
  asm volatile (
    "1: \n"
    "mov %0, %1 \n"
    "mov r12, %5 \n"
    "veor q2, q2, q2 \n"
    "veor q3, q3, q3 \n"
    "2: \n"
    // load 16 pixels into q0
    MEMACCESS(0)
    "vld1.8 {q0}, [%0], %3 \n"
    "vaddw.u8 q3, q3, d1 \n"
    "vaddw.u8 q2, q2, d0 \n"
    "subs r12, r12, #1 \n"
    "bgt 2b \n"
    MEMACCESS(2)
    "vst1.16 {q2, q3}, [%2]! \n"  // store pixels
    "add %1, %1, #16 \n"
    "subs %4, %4, #16 \n"  // 16 processed per loop
    "bgt 1b \n"
  : "=&r"(src_tmp),    // %0
    "+r"(src_ptr),     // %1
    "+r"(dst_ptr),     // %2
    "+r"(src_stride),  // %3
    "+r"(src_width),   // %4
    "+r"(src_height)   // %5
  :
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

// clang-format off
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n) \
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5 \n" \
  "add %3, %3, %4 \n" \
  MEMACCESS(6) \
  "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n"
// clang-format on

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8)((int)(a) +
//   ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

void ScaleFilterCols_NEON(uint8* dst_ptr,
                          const uint8* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_ptr;
  asm volatile (
    "vdup.32 q0, %3 \n"  // x
    "vdup.32 q1, %4 \n"  // dx
    "vld1.32 {q2}, [%5] \n"  // 0 1 2 3
    "vshl.i32 q3, q1, #2 \n"  // 4 * dx
    "vmul.s32 q1, q1, q2 \n"
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32 q1, q1, q0 \n"
    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
    "vadd.s32 q2, q1, q3 \n"
    "vshl.i32 q0, q3, #1 \n"  // 8 * dx
    "1: \n"
    LOAD2_DATA8_LANE(0)
    LOAD2_DATA8_LANE(1)
    LOAD2_DATA8_LANE(2)
    LOAD2_DATA8_LANE(3)
    LOAD2_DATA8_LANE(4)
    LOAD2_DATA8_LANE(5)
    LOAD2_DATA8_LANE(6)
    LOAD2_DATA8_LANE(7)
    "vmov q10, q1 \n"
    "vmov q11, q2 \n"
    "vuzp.16 q10, q11 \n"
    "vmovl.u8 q8, d6 \n"
    "vmovl.u8 q9, d7 \n"
    "vsubl.s16 q11, d18, d16 \n"
    "vsubl.s16 q12, d19, d17 \n"
    "vmovl.u16 q13, d20 \n"
    "vmovl.u16 q10, d21 \n"
    "vmul.s32 q11, q11, q13 \n"
    "vmul.s32 q12, q12, q10 \n"
    "vrshrn.s32 d18, q11, #16 \n"
    "vrshrn.s32 d19, q12, #16 \n"
    "vadd.s16 q8, q8, q9 \n"
    "vmovn.s16 d6, q8 \n"

    MEMACCESS(0)
    "vst1.8 {d6}, [%0]! \n"  // store pixels
    "vadd.s32 q1, q1, q0 \n"
    "vadd.s32 q2, q2, q0 \n"
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "bgt 1b \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13"
  );
}
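
// The BLENDER formula quoted above, as a standalone scalar sketch: f is the
// 16.16 fractional position between the source pair (a, b). Hypothetical
// helper, not compiled.
#if 0
static uint8 Blender_Sketch(uint8 a, uint8 b, int f) {
  return (uint8)((int)a + (((int)f * ((int)b - (int)a) + 0x8000) >> 16));
}
#endif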

#undef LOAD2_DATA8_LANE

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #64 \n"
    "beq 75f \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "cmp %4, #192 \n"
    "beq 25f \n"

    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"
    // General purpose row blend.
    "1: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
    "25: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
    "50: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
    "75: \n"
    MEMACCESS(1)
    "vld1.8 {q1}, [%1]! \n"
    MEMACCESS(2)
    "vld1.8 {q0}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    MEMACCESS(1)
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    MEMACCESS(0)
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"

    "99: \n"
    MEMACCESS(0)
    "vst1.8 {d1[7]}, [%0] \n"
  : "+r"(dst_ptr),           // %0
    "+r"(src_ptr),           // %1
    "+r"(src_stride),        // %2
    "+r"(dst_width),         // %3
    "+r"(source_y_fraction)  // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
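
// Scalar sketch of the general-purpose row blend above: source_y_fraction is
// in the range 0..256, where 0 keeps row 0 entirely; the 25/50/75/100 labels
// are shortcuts for the same formula, and the NEON version also duplicates
// the final pixel one past the end, which this sketch omits. Hypothetical
// helper, not compiled.
#if 0
static void ScaleFilterRows_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  const uint8* s0 = src_ptr;
  const uint8* s1 = src_ptr + src_stride;
  int y1 = source_y_fraction;
  int y0 = 256 - y1;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s0[x] * y0 + s1[x] * y1 + 128) >> 8);
  }
}
#endif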

void ScaleARGBRowDown2_NEON(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile (
    "1: \n"
    // load even pixels into q0, odd into q1
    MEMACCESS(0)
    "vld2.32 {q0, q1}, [%0]! \n"
    MEMACCESS(0)
    "vld2.32 {q2, q3}, [%0]! \n"
    "subs %2, %2, #8 \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    MEMACCESS(1)
    "vst1.8 {q3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile (
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #1 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #1 \n"
    "vrshrn.u16 d2, q2, #1 \n"
    "vrshrn.u16 d3, q3, #1 \n"
    MEMACCESS(1)
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst,
                               int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    MEMACCESS(0)
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %3, %3, #8 \n"  // 8 processed per loop.
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    MEMACCESS(1)
    "vld4.8 {d16, d18, d20, d22}, [%1]! \n"  // load 8 more ARGB pixels.
    MEMACCESS(1)
    "vld4.8 {d17, d19, d21, d23}, [%1]! \n"  // load last 8 ARGB pixels.
    "vpadal.u8 q0, q8 \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q9 \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q10 \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8 q3, q11 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vrshrn.u16 d2, q2, #2 \n"
    "vrshrn.u16 d3, q3, #2 \n"
    MEMACCESS(2)
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "mov r12, %3, lsl #2 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.32 {d0[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d0[1]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[0]}, [%0], r12 \n"
    MEMACCESS(0)
    "vld1.32 {d1[1]}, [%0], r12 \n"
    "subs %2, %2, #4 \n"  // 4 pixels per loop.
    MEMACCESS(1)
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"(src_stepx)   // %3
  : "memory", "cc", "r12", "q0"
  );
}
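
// Scalar sketch of ScaleARGBRowDownEven above: copy one whole 4-byte ARGB
// pixel every src_stepx source pixels. Hypothetical helper, not compiled.
#if 0
static void ScaleARGBRowDownEven_Sketch(const uint8* src_argb, int src_stepx,
                                        uint8* dst_argb, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    const uint8* src_pixel = src_argb + x * src_stepx * 4;
    dst_argb[x * 4 + 0] = src_pixel[0];
    dst_argb[x * 4 + 1] = src_pixel[1];
    dst_argb[x * 4 + 2] = src_pixel[2];
    dst_argb[x * 4 + 3] = src_pixel[3];
  }
}
#endif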

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  asm volatile (
    "mov r12, %4, lsl #2 \n"
    "add %1, %1, %0 \n"
    "1: \n"
    MEMACCESS(0)
    "vld1.8 {d0}, [%0], r12 \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
    "vld1.8 {d1}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d2}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d3}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d4}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d5}, [%1], r12 \n"
    MEMACCESS(0)
    "vld1.8 {d6}, [%0], r12 \n"
    MEMACCESS(1)
    "vld1.8 {d7}, [%1], r12 \n"
    "vaddl.u8 q0, d0, d1 \n"
    "vaddl.u8 q1, d2, d3 \n"
    "vaddl.u8 q2, d4, d5 \n"
    "vaddl.u8 q3, d6, d7 \n"
    "vswp.8 d1, d2 \n"  // ab_cd -> ac_bd
    "vswp.8 d5, d6 \n"  // ef_gh -> eg_fh
    "vadd.u16 q0, q0, q1 \n"  // (a+b)_(c+d)
    "vadd.u16 q2, q2, q3 \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2 \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2 \n"  // next 2 pixels.
    "subs %3, %3, #4 \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

// clang-format off
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(dn, n) \
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5, lsl #2 \n" \
  "add %3, %3, %4 \n" \
  MEMACCESS(6) \
  "vld1.32 {" #dn "[" #n "]}, [%6] \n"
// clang-format on

void ScaleARGBCols_NEON(uint8* dst_argb,
                        const uint8* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  int tmp;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "1: \n"
    LOAD1_DATA32_LANE(d0, 0)
    LOAD1_DATA32_LANE(d0, 1)
    LOAD1_DATA32_LANE(d1, 0)
    LOAD1_DATA32_LANE(d1, 1)
    LOAD1_DATA32_LANE(d2, 0)
    LOAD1_DATA32_LANE(d2, 1)
    LOAD1_DATA32_LANE(d3, 0)
    LOAD1_DATA32_LANE(d3, 1)

    MEMACCESS(0)
    "vst1.32 {q0, q1}, [%0]! \n"  // store pixels
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "bgt 1b \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "=&r"(tmp),       // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1"
  );
}

#undef LOAD1_DATA32_LANE

// clang-format off
// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(dn1, dn2, n) \
  "lsr %5, %3, #16 \n" \
  "add %6, %1, %5, lsl #2 \n" \
  "add %3, %3, %4 \n" \
  MEMACCESS(6) \
  "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n"
// clang-format on

void ScaleARGBFilterCols_NEON(uint8* dst_argb,
                              const uint8* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8* src_tmp = src_argb;
  asm volatile (
    "vdup.32 q0, %3 \n"  // x
    "vdup.32 q1, %4 \n"  // dx
    "vld1.32 {q2}, [%5] \n"  // 0 1 2 3
    "vshl.i32 q9, q1, #2 \n"  // 4 * dx
    "vmul.s32 q1, q1, q2 \n"
    "vmov.i8 q3, #0x7f \n"  // 0x7F
    "vmov.i16 q15, #0x7f \n"  // 0x7F
    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
    "vadd.s32 q8, q1, q0 \n"
    "1: \n"
    // d0, d1: a
    // d2, d3: b
    LOAD2_DATA32_LANE(d0, d2, 0)
    LOAD2_DATA32_LANE(d0, d2, 1)
    LOAD2_DATA32_LANE(d1, d3, 0)
    LOAD2_DATA32_LANE(d1, d3, 1)
    "vshrn.i32 d22, q8, #9 \n"
    "vand.16 d22, d22, d30 \n"
    "vdup.8 d24, d22[0] \n"
    "vdup.8 d25, d22[2] \n"
    "vdup.8 d26, d22[4] \n"
    "vdup.8 d27, d22[6] \n"
    "vext.8 d4, d24, d25, #4 \n"
    "vext.8 d5, d26, d27, #4 \n"  // f
    "veor.8 q10, q2, q3 \n"  // 0x7f ^ f
    "vmull.u8 q11, d0, d20 \n"
    "vmull.u8 q12, d1, d21 \n"
    "vmull.u8 q13, d2, d4 \n"
    "vmull.u8 q14, d3, d5 \n"
    "vadd.i16 q11, q11, q13 \n"
    "vadd.i16 q12, q12, q14 \n"
    "vshrn.i16 d0, q11, #7 \n"
    "vshrn.i16 d1, q12, #7 \n"

    MEMACCESS(0)
    "vst1.32 {d0, d1}, [%0]! \n"  // store pixels
    "vadd.s32 q8, q8, q9 \n"
    "subs %2, %2, #4 \n"  // 4 processed per loop
    "bgt 1b \n"
  : "+r"(dst_argb),   // %0
    "+r"(src_argb),   // %1
    "+r"(dst_width),  // %2
    "+r"(x),          // %3
    "+r"(dx),         // %4
    "+r"(tmp),        // %5
    "+r"(src_tmp)     // %6
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9",
    "q10", "q11", "q12", "q13", "q14", "q15"
  );
}

#undef LOAD2_DATA32_LANE

#endif  // defined(__ARM_NEON__) && !defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif