/*
 *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon armv8 64 bit.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load even pixels into v0, odd into v1
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "st1 {v1.16b}, [%1], #16 \n"  // store odd pixels
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1"  // Clobber List
  );
}
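
// For reference, the per-row operation above is equivalent to this scalar
// sketch (illustrative only, not the C fallback used elsewhere in libyuv):
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = src_ptr[i * 2 + 1];  // keep the odd pixel of each pair
//   }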

// Read 32x1, average down, and write 16x1.
void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst,
                              int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load even pixels into v0, odd into v1
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #16 \n"  // 16 processed per loop
      "urhadd v0.16b, v0.16b, v1.16b \n"  // rounding half add
      "st1 {v0.16b}, [%1], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1"  // Clobber List
  );
}
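
// A scalar sketch of the same rounding half add (illustrative only):
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = (src_ptr[i * 2] + src_ptr[i * 2 + 1] + 1) >> 1;
//   }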

// Read 32x2, average down, and write 16x1.
void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst,
                           int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %1, %0 \n"
      "1: \n"
      "ld1 {v0.16b, v1.16b}, [%0], #32 \n"  // load row 1 and post inc
      "ld1 {v2.16b, v3.16b}, [%1], #32 \n"  // load row 2 and post inc
      "subs %w3, %w3, #16 \n"  // 16 processed per loop
      "uaddlp v0.8h, v0.16b \n"  // row 1 add adjacent
      "uaddlp v1.8h, v1.16b \n"
      "uadalp v0.8h, v2.16b \n"  // += row 2 add adjacent
      "uadalp v1.8h, v3.16b \n"
      "rshrn v0.8b, v0.8h, #2 \n"  // round and pack
      "rshrn2 v0.16b, v1.8h, #2 \n"
      "st1 {v0.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "v0", "v1", "v2", "v3"  // Clobber List
  );
}
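
// Scalar sketch of the 2x2 box filter above (illustrative only). "t" is the
// second source row, i.e. src_ptr + src_stride:
//   const uint8_t* t = src_ptr + src_stride;
//   for (int i = 0; i < dst_width; ++i) {
//     dst[i] = (src_ptr[i * 2] + src_ptr[i * 2 + 1] +
//               t[i * 2] + t[i * 2 + 1] + 2) >> 2;
//   }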

void ScaleRowDown4_NEON(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "st1 {v2.8b}, [%1], #8 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
                           ptrdiff_t src_stride,
                           uint8_t* dst_ptr,
                           int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  asm volatile(
      "1: \n"
      "ld1 {v0.16b}, [%0], #16 \n"  // load up 16x4
      "ld1 {v1.16b}, [%2], #16 \n"
      "ld1 {v2.16b}, [%3], #16 \n"
      "ld1 {v3.16b}, [%4], #16 \n"
      "subs %w5, %w5, #4 \n"
      "uaddlp v0.8h, v0.16b \n"
      "uadalp v0.8h, v1.16b \n"
      "uadalp v0.8h, v2.16b \n"
      "uadalp v0.8h, v3.16b \n"
      "addp v0.8h, v0.8h, v0.8h \n"
      "rshrn v0.8b, v0.8h, #4 \n"  // divide by 16 w/rounding
      "st1 {v0.s}[0], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(src_ptr1),  // %2
        "+r"(src_ptr2),  // %3
        "+r"(src_ptr3),  // %4
        "+r"(dst_width)  // %5
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
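
// Scalar sketch of the 4x4 box filter above (illustrative only): each output
// pixel is the rounded average of a 4x4 block of source pixels.
//   for (int i = 0; i < dst_width; ++i) {
//     int sum = 0;
//     for (int y = 0; y < 4; ++y) {
//       for (int x = 0; x < 4; ++x) {
//         sum += src_ptr[y * src_stride + i * 4 + x];
//       }
//     }
//     dst_ptr[i] = (sum + 8) >> 4;
//   }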

// Down scale from 4 to 3 pixels. Use the neon multilane read/write
// to load every 4th pixel into one of 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "subs %w2, %w2, #24 \n"
      "orr v2.16b, v3.16b, v3.16b \n"  // order v0,v1,v2
      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      :
      : "v0", "v1", "v2", "v3", "memory", "cc");
}
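
// The point sample above keeps pixels 0, 1 and 3 of every group of 4
// (register v3 is copied over v2 before the st3). A scalar sketch
// (illustrative only):
//   const uint8_t* s = src_ptr;
//   for (int i = 0; i < dst_width; i += 3, s += 4) {
//     dst_ptr[i + 0] = s[0];
//     dst_ptr[i + 1] = s[1];
//     dst_ptr[i + 2] = s[3];
//   }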

void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi v20.8b, #3 \n"
      "add %3, %3, %0 \n"
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
      "subs %w2, %w2, #24 \n"

      // filter src line 0 with src line 1
      // expand chars to shorts to allow for room
      // when adding lines together
      "ushll v16.8h, v4.8b, #0 \n"
      "ushll v17.8h, v5.8b, #0 \n"
      "ushll v18.8h, v6.8b, #0 \n"
      "ushll v19.8h, v7.8b, #0 \n"

      // 3 * line_0 + line_1
      "umlal v16.8h, v0.8b, v20.8b \n"
      "umlal v17.8h, v1.8b, v20.8b \n"
      "umlal v18.8h, v2.8b, v20.8b \n"
      "umlal v19.8h, v3.8b, v20.8b \n"

      // (3 * line_0 + line_1) >> 2
      "uqrshrn v0.8b, v16.8h, #2 \n"
      "uqrshrn v1.8b, v17.8h, #2 \n"
      "uqrshrn v2.8b, v18.8h, #2 \n"
      "uqrshrn v3.8b, v19.8h, #2 \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll v16.8h, v1.8b, #0 \n"
      "umlal v16.8h, v0.8b, v20.8b \n"
      "uqrshrn v0.8b, v16.8h, #2 \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd v1.8b, v1.8b, v2.8b \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll v16.8h, v2.8b, #0 \n"
      "umlal v16.8h, v3.8b, v20.8b \n"
      "uqrshrn v2.8b, v16.8h, #2 \n"

      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"

      "b.gt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "memory", "cc");
}

void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  asm volatile(
      "movi v20.8b, #3 \n"
      "add %3, %3, %0 \n"
      "1: \n"
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"  // src line 1
      "subs %w2, %w2, #24 \n"
      // average src line 0 with src line 1
      "urhadd v0.8b, v0.8b, v4.8b \n"
      "urhadd v1.8b, v1.8b, v5.8b \n"
      "urhadd v2.8b, v2.8b, v6.8b \n"
      "urhadd v3.8b, v3.8b, v7.8b \n"

      // a0 = (src[0] * 3 + s[1] * 1) >> 2
      "ushll v4.8h, v1.8b, #0 \n"
      "umlal v4.8h, v0.8b, v20.8b \n"
      "uqrshrn v0.8b, v4.8h, #2 \n"

      // a1 = (src[1] * 1 + s[2] * 1) >> 1
      "urhadd v1.8b, v1.8b, v2.8b \n"

      // a2 = (src[2] * 1 + s[3] * 3) >> 2
      "ushll v4.8h, v2.8b, #0 \n"
      "umlal v4.8h, v3.8b, v20.8b \n"
      "uqrshrn v2.8b, v4.8h, #2 \n"

      "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_stride)  // %3
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc");
}
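
// Both 3/4 box kernels share the same horizontal filter once the two source
// rows have been combined (3:1 weighted for the _0_Box variant, a plain
// rounded average for the _1_Box variant). A scalar sketch of that filter
// over one group of 4 combined pixels s[0..3] (illustrative only):
//   dst[0] = (s[0] * 3 + s[1] + 2) >> 2;
//   dst[1] = (s[1] + s[2] + 1) >> 1;
//   dst[2] = (s[2] + s[3] * 3 + 2) >> 2;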

static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19,
                              22, 24, 27, 30, 0, 0, 0, 0};
static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20,
                                34, 6, 22, 35, 0, 0, 0, 0};
static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12, 65536 / 12,
                                   65536 / 12, 65536 / 12};
static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18, 65536 / 18,
                                   65536 / 18, 65536 / 18};
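
// The multipliers above implement division by a non power of 2 via sqrdmulh,
// which (roughly) computes (2 * a * b + 0x8000) >> 16. Worked example
// (illustrative, ignoring the saturation corner case):
//   sqrdmulh(sum, 65536 / 12) ~= sum * 2 * 5461 / 65536 ~= sum / 6
//   sqrdmulh(sum, 65536 / 18) ~= sum * 2 * 3640 / 65536 ~= sum / 9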

// 32 -> 12
void ScaleRowDown38_NEON(const uint8_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint8_t* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile(
      "ld1 {v3.16b}, [%3] \n"
      "1: \n"
      "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
      "subs %w2, %w2, #12 \n"
      "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
      "st1 {v2.8b}, [%1], #8 \n"
      "st1 {v2.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst_ptr),   // %1
        "+r"(dst_width)  // %2
      : "r"(&kShuf38)    // %3
      : "v0", "v1", "v2", "v3", "memory", "cc");
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8_t* dst_ptr,
                                      int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
  ptrdiff_t tmp_src_stride = src_stride;

  asm volatile(
      "ld1 {v29.8h}, [%5] \n"
      "ld1 {v30.16b}, [%6] \n"
      "ld1 {v31.8h}, [%7] \n"
      "add %2, %2, %0 \n"
      "1: \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
      "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
      "subs %w4, %w4, #12 \n"

      // Shuffle the input data around to align it
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1 v20.8b, v0.8b, v1.8b \n"
      "trn2 v21.8b, v0.8b, v1.8b \n"
      "trn1 v22.8b, v4.8b, v5.8b \n"
      "trn2 v23.8b, v4.8b, v5.8b \n"
      "trn1 v24.8b, v16.8b, v17.8b \n"
      "trn2 v25.8b, v16.8b, v17.8b \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1 v0.8b, v2.8b, v3.8b \n"
      "trn2 v1.8b, v2.8b, v3.8b \n"
      "trn1 v4.8b, v6.8b, v7.8b \n"
      "trn2 v5.8b, v6.8b, v7.8b \n"
      "trn1 v16.8b, v18.8b, v19.8b \n"
      "trn2 v17.8b, v18.8b, v19.8b \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp v20.4h, v20.8b \n"
      "uaddlp v21.4h, v21.8b \n"
      "uaddlp v22.4h, v22.8b \n"
      "uaddlp v23.4h, v23.8b \n"
      "uaddlp v24.4h, v24.8b \n"
      "uaddlp v25.4h, v25.8b \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp v1.4h, v1.8b \n"
      "uaddlp v5.4h, v5.8b \n"
      "uaddlp v17.4h, v17.8b \n"

      // combine source lines
      "add v20.4h, v20.4h, v22.4h \n"
      "add v21.4h, v21.4h, v23.4h \n"
      "add v20.4h, v20.4h, v24.4h \n"
      "add v21.4h, v21.4h, v25.4h \n"
      "add v2.4h, v1.4h, v5.4h \n"
      "add v2.4h, v2.4h, v17.4h \n"

      // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
      //             + s[6 + st * 1] + s[7 + st * 1]
      //             + s[6 + st * 2] + s[7 + st * 2]) / 6
      "sqrdmulh v2.8h, v2.8h, v29.8h \n"
      "xtn v2.8b, v2.8h \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
      "ushll v16.8h, v16.8b, #0 \n"
      "uaddl v0.8h, v0.8b, v4.8b \n"

      // combine source lines
      "add v0.8h, v0.8h, v16.8h \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1 v1.8h, v0.8h, v0.8h \n"
      "trn2 v4.8h, v0.8h, v0.8h \n"
      "xtn v0.4h, v1.4s \n"
      "xtn v4.4h, v4.4s \n"

      // 0+1+2, 3+4+5
      "add v20.8h, v20.8h, v0.8h \n"
      "add v21.8h, v21.8h, v4.8h \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "sqrdmulh v0.8h, v20.8h, v31.8h \n"
      "sqrdmulh v1.8h, v21.8h, v31.8h \n"

      // Align for table lookup, vtbl requires registers to be adjacent
      "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n"

      "st1 {v3.8b}, [%1], #8 \n"
      "st1 {v3.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(src_ptr1),        // %3
        "+r"(dst_width)        // %4
      : "r"(&kMult38_Div6),    // %5
        "r"(&kShuf38_2),       // %6
        "r"(&kMult38_Div9)     // %7
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31",
        "memory", "cc");
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst_ptr,
                               int dst_width) {
  // TODO(fbarchard): use src_stride directly for clang 3.5+.
  ptrdiff_t tmp_src_stride = src_stride;
  asm volatile(
      "ld1 {v30.8h}, [%4] \n"
      "ld1 {v31.16b}, [%5] \n"
      "add %2, %2, %0 \n"
      "1: \n"

      // 00 40 01 41 02 42 03 43
      // 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63
      // 30 70 31 71 32 72 33 73
      "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
      "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
      "subs %w3, %w3, #12 \n"

      // Shuffle the input data around to align it
      // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
      // 00 10 01 11 02 12 03 13
      // 40 50 41 51 42 52 43 53
      "trn1 v16.8b, v0.8b, v1.8b \n"
      "trn2 v17.8b, v0.8b, v1.8b \n"
      "trn1 v18.8b, v4.8b, v5.8b \n"
      "trn2 v19.8b, v4.8b, v5.8b \n"

      // 20 30 21 31 22 32 23 33
      // 60 70 61 71 62 72 63 73
      "trn1 v0.8b, v2.8b, v3.8b \n"
      "trn2 v1.8b, v2.8b, v3.8b \n"
      "trn1 v4.8b, v6.8b, v7.8b \n"
      "trn2 v5.8b, v6.8b, v7.8b \n"

      // 00+10 01+11 02+12 03+13
      // 40+50 41+51 42+52 43+53
      "uaddlp v16.4h, v16.8b \n"
      "uaddlp v17.4h, v17.8b \n"
      "uaddlp v18.4h, v18.8b \n"
      "uaddlp v19.4h, v19.8b \n"

      // 60+70 61+71 62+72 63+73
      "uaddlp v1.4h, v1.8b \n"
      "uaddlp v5.4h, v5.8b \n"

      // combine source lines
      "add v16.4h, v16.4h, v18.4h \n"
      "add v17.4h, v17.4h, v19.4h \n"
      "add v2.4h, v1.4h, v5.4h \n"

      // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
      "uqrshrn v2.8b, v2.8h, #2 \n"

      // Shuffle 2,3 reg around so that 2 can be added to the
      // 0,1 reg and 3 can be added to the 4,5 reg. This
      // requires expanding from u8 to u16 as the 0,1 and 4,5
      // registers are already expanded. Then do transposes
      // to get aligned.
      // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33

      // combine source lines
      "uaddl v0.8h, v0.8b, v4.8b \n"

      // xx 20 xx 21 xx 22 xx 23
      // xx 30 xx 31 xx 32 xx 33
      "trn1 v1.8h, v0.8h, v0.8h \n"
      "trn2 v4.8h, v0.8h, v0.8h \n"
      "xtn v0.4h, v1.4s \n"
      "xtn v4.4h, v4.4s \n"

      // 0+1+2, 3+4+5
      "add v16.8h, v16.8h, v0.8h \n"
      "add v17.8h, v17.8h, v4.8h \n"

      // Need to divide, but can't downshift as the value
      // isn't a power of 2. So multiply by 65536 / n
      // and take the upper 16 bits.
      "sqrdmulh v0.8h, v16.8h, v30.8h \n"
      "sqrdmulh v1.8h, v17.8h, v30.8h \n"

      // Align for table lookup, vtbl requires registers to
      // be adjacent

      "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n"

      "st1 {v3.8b}, [%1], #8 \n"
      "st1 {v3.s}[2], [%1], #4 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),         // %0
        "+r"(dst_ptr),         // %1
        "+r"(tmp_src_stride),  // %2
        "+r"(dst_width)        // %3
      : "r"(&kMult38_Div6),    // %4
        "r"(&kShuf38_2)        // %5
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19", "v30", "v31", "memory", "cc");
}

void ScaleAddRows_NEON(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint16_t* dst_ptr,
                       int src_width,
                       int src_height) {
  const uint8_t* src_tmp;
  asm volatile(
      "1: \n"
      "mov %0, %1 \n"
      "mov w12, %w5 \n"
      "eor v2.16b, v2.16b, v2.16b \n"
      "eor v3.16b, v3.16b, v3.16b \n"
      "2: \n"
      // load 16 pixels into q0
      "ld1 {v0.16b}, [%0], %3 \n"
      "uaddw2 v3.8h, v3.8h, v0.16b \n"
      "uaddw v2.8h, v2.8h, v0.8b \n"
      "subs w12, w12, #1 \n"
      "b.gt 2b \n"
      "st1 {v2.8h, v3.8h}, [%2], #32 \n"  // store pixels
      "add %1, %1, #16 \n"
      "subs %w4, %w4, #16 \n"  // 16 processed per loop
      "b.gt 1b \n"
      : "=&r"(src_tmp),    // %0
        "+r"(src_ptr),     // %1
        "+r"(dst_ptr),     // %2
        "+r"(src_stride),  // %3
        "+r"(src_width),   // %4
        "+r"(src_height)   // %5
      :
      : "memory", "cc", "w12", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
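
// Scalar sketch of the column sums computed above (illustrative only): each
// 16-bit output is the sum of src_height bytes from the same column.
//   for (int x = 0; x < src_width; ++x) {
//     uint16_t sum = 0;
//     for (int y = 0; y < src_height; ++y) {
//       sum += src_ptr[y * src_stride + x];
//     }
//     dst_ptr[x] = sum;
//   }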

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA8_LANE(n)              \
  "lsr %5, %3, #16 \n"                   \
  "add %6, %1, %5 \n"                    \
  "add %3, %3, %4 \n"                    \
  "ld2 {v4.b, v5.b}[" #n "], [%6] \n"

// The NEON version mimics this formula (from row_common.cc):
// #define BLENDER(a, b, f) (uint8_t)((int)(a) +
//    ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16))

void ScaleFilterCols_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          int dst_width,
                          int x,
                          int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_ptr;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile(
      "dup v0.4s, %w3 \n"  // x
      "dup v1.4s, %w4 \n"  // dx
      "ld1 {v2.4s}, [%5] \n"  // 0 1 2 3
      "shl v3.4s, v1.4s, #2 \n"  // 4 * dx
      "mul v1.4s, v1.4s, v2.4s \n"
      // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add v1.4s, v1.4s, v0.4s \n"
      // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
      "add v2.4s, v1.4s, v3.4s \n"
      "shl v0.4s, v3.4s, #1 \n"  // 8 * dx
      "1: \n"
      LOAD2_DATA8_LANE(0)
      LOAD2_DATA8_LANE(1)
      LOAD2_DATA8_LANE(2)
      LOAD2_DATA8_LANE(3)
      LOAD2_DATA8_LANE(4)
      LOAD2_DATA8_LANE(5)
      LOAD2_DATA8_LANE(6)
      LOAD2_DATA8_LANE(7)
      "mov v6.16b, v1.16b \n"
      "mov v7.16b, v2.16b \n"
      "uzp1 v6.8h, v6.8h, v7.8h \n"
      "ushll v4.8h, v4.8b, #0 \n"
      "ushll v5.8h, v5.8b, #0 \n"
      "ssubl v16.4s, v5.4h, v4.4h \n"
      "ssubl2 v17.4s, v5.8h, v4.8h \n"
      "ushll v7.4s, v6.4h, #0 \n"
      "ushll2 v6.4s, v6.8h, #0 \n"
      "mul v16.4s, v16.4s, v7.4s \n"
      "mul v17.4s, v17.4s, v6.4s \n"
      "rshrn v6.4h, v16.4s, #16 \n"
      "rshrn2 v6.8h, v17.4s, #16 \n"
      "add v4.8h, v4.8h, v6.8h \n"
      "xtn v4.8b, v4.8h \n"

      "st1 {v4.8b}, [%0], #8 \n"  // store pixels
      "add v1.4s, v1.4s, v0.4s \n"
      "add v2.4s, v2.4s, v0.4s \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_ptr),    // %0
        "+r"(src_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "+r"(tmp),        // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3",
        "v4", "v5", "v6", "v7", "v16", "v17"
  );
}

#undef LOAD2_DATA8_LANE
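
// Scalar sketch of the column filter above, using the BLENDER formula quoted
// before the function (illustrative only):
//   for (int j = 0; j < dst_width; ++j) {
//     int xi = x >> 16;
//     int a = src_ptr[xi];
//     int b = src_ptr[xi + 1];
//     dst_ptr[j] = (uint8_t)(a + (((x & 0xffff) * (b - a) + 0x8000) >> 16));
//     x += dx;
//   }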

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8_t* dst_ptr,
                          const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  int y_fraction = 256 - source_y_fraction;
  asm volatile(
      "cmp %w4, #0 \n"
      "b.eq 100f \n"
      "add %2, %2, %1 \n"
      "cmp %w4, #64 \n"
      "b.eq 75f \n"
      "cmp %w4, #128 \n"
      "b.eq 50f \n"
      "cmp %w4, #192 \n"
      "b.eq 25f \n"

      "dup v5.8b, %w4 \n"
      "dup v4.8b, %w5 \n"
      // General purpose row blend.
      "1: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "umull v6.8h, v0.8b, v4.8b \n"
      "umull2 v7.8h, v0.16b, v4.16b \n"
      "umlal v6.8h, v1.8b, v5.8b \n"
      "umlal2 v7.8h, v1.16b, v5.16b \n"
      "rshrn v0.8b, v6.8h, #8 \n"
      "rshrn2 v0.16b, v7.8h, #8 \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 1b \n"
      "b 99f \n"

      // Blend 25 / 75.
      "25: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 25b \n"
      "b 99f \n"

      // Blend 50 / 50.
      "50: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "ld1 {v1.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 50b \n"
      "b 99f \n"

      // Blend 75 / 25.
      "75: \n"
      "ld1 {v1.16b}, [%1], #16 \n"
      "ld1 {v0.16b}, [%2], #16 \n"
      "subs %w3, %w3, #16 \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "urhadd v0.16b, v0.16b, v1.16b \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 75b \n"
      "b 99f \n"

      // Blend 100 / 0 - Copy row unchanged.
      "100: \n"
      "ld1 {v0.16b}, [%1], #16 \n"
      "subs %w3, %w3, #16 \n"
      "st1 {v0.16b}, [%0], #16 \n"
      "b.gt 100b \n"

      "99: \n"
      "st1 {v0.b}[15], [%0] \n"
      : "+r"(dst_ptr),            // %0
        "+r"(src_ptr),            // %1
        "+r"(src_stride),         // %2
        "+r"(dst_width),          // %3
        "+r"(source_y_fraction),  // %4
        "+r"(y_fraction)          // %5
      :
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc");
}
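
// Scalar sketch of the general row blend path above (illustrative only); the
// 25/50/75/100 cases are shortcuts of the same formula:
//   const uint8_t* row1 = src_ptr + src_stride;
//   int f = source_y_fraction;  // 0..256
//   for (int i = 0; i < dst_width; ++i) {
//     dst_ptr[i] = (src_ptr[i] * (256 - f) + row1[i] * f + 128) >> 8;
//   }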

void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
                            ptrdiff_t src_stride,
                            uint8_t* dst,
                            int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "mov v2.16b, v3.16b \n"
      "st2 {v1.4s,v2.4s}, [%1], #32 \n"  // store 8 odd pixels
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(dst),       // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3
      "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n"
      "subs %w2, %w2, #8 \n"  // 8 processed per loop

      "urhadd v0.16b, v0.16b, v1.16b \n"  // rounding half add
      "urhadd v1.16b, v2.16b, v3.16b \n"
      "st2 {v0.4s,v1.4s}, [%1], #32 \n"  // store 8 pixels
      "b.gt 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_argb),  // %1
        "+r"(dst_width)  // %2
      :
      : "memory", "cc", "v0", "v1", "v2", "v3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
                               ptrdiff_t src_stride,
                               uint8_t* dst,
                               int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %1, %0 \n"
      "1: \n"
      "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 8 ARGB
      "subs %w3, %w3, #8 \n"  // 8 processed per loop.
      "uaddlp v0.8h, v0.16b \n"  // B 16 bytes -> 8 shorts.
      "uaddlp v1.8h, v1.16b \n"  // G 16 bytes -> 8 shorts.
      "uaddlp v2.8h, v2.16b \n"  // R 16 bytes -> 8 shorts.
      "uaddlp v3.8h, v3.16b \n"  // A 16 bytes -> 8 shorts.
      "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n"  // load 8
      "uadalp v0.8h, v16.16b \n"  // B 16 bytes -> 8 shorts.
      "uadalp v1.8h, v17.16b \n"  // G 16 bytes -> 8 shorts.
      "uadalp v2.8h, v18.16b \n"  // R 16 bytes -> 8 shorts.
      "uadalp v3.8h, v19.16b \n"  // A 16 bytes -> 8 shorts.
      "rshrn v0.8b, v0.8h, #2 \n"  // round and pack
      "rshrn v1.8b, v1.8h, #2 \n"
      "rshrn v2.8b, v2.8h, #2 \n"
      "rshrn v3.8b, v3.8h, #2 \n"
      "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8_t* dst_argb,
                               int dst_width) {
  (void)src_stride;
  asm volatile(
      "1: \n"
      "ld1 {v0.s}[0], [%0], %3 \n"
      "ld1 {v0.s}[1], [%0], %3 \n"
      "ld1 {v0.s}[2], [%0], %3 \n"
      "ld1 {v0.s}[3], [%0], %3 \n"
      "subs %w2, %w2, #4 \n"  // 4 pixels per loop.
      "st1 {v0.16b}, [%1], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_argb),                // %0
        "+r"(dst_argb),                // %1
        "+r"(dst_width)                // %2
      : "r"((int64_t)(src_stepx * 4))  // %3
      : "memory", "cc", "v0");
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8_t* dst_argb,
                                  int dst_width) {
  asm volatile(
      "add %1, %1, %0 \n"
      "1: \n"
      "ld1 {v0.8b}, [%0], %4 \n"  // Read 4 2x2 -> 2x1
      "ld1 {v1.8b}, [%1], %4 \n"
      "ld1 {v2.8b}, [%0], %4 \n"
      "ld1 {v3.8b}, [%1], %4 \n"
      "ld1 {v4.8b}, [%0], %4 \n"
      "ld1 {v5.8b}, [%1], %4 \n"
      "ld1 {v6.8b}, [%0], %4 \n"
      "ld1 {v7.8b}, [%1], %4 \n"
      "uaddl v0.8h, v0.8b, v1.8b \n"
      "uaddl v2.8h, v2.8b, v3.8b \n"
      "uaddl v4.8h, v4.8b, v5.8b \n"
      "uaddl v6.8h, v6.8b, v7.8b \n"
      "mov v16.d[1], v0.d[1] \n"  // ab_cd -> ac_bd
      "mov v0.d[1], v2.d[0] \n"
      "mov v2.d[0], v16.d[1] \n"
      "mov v16.d[1], v4.d[1] \n"  // ef_gh -> eg_fh
      "mov v4.d[1], v6.d[0] \n"
      "mov v6.d[0], v16.d[1] \n"
      "add v0.8h, v0.8h, v2.8h \n"  // (a+b)_(c+d)
      "add v4.8h, v4.8h, v6.8h \n"  // (e+f)_(g+h)
      "rshrn v0.8b, v0.8h, #2 \n"  // first 2 pixels.
      "rshrn2 v0.16b, v4.8h, #2 \n"  // next 2 pixels.
      "subs %w3, %w3, #4 \n"  // 4 pixels per loop.
      "st1 {v0.16b}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_argb),                // %0
        "+r"(src_stride),              // %1
        "+r"(dst_argb),                // %2
        "+r"(dst_width)                // %3
      : "r"((int64_t)(src_stepx * 4))  // %4
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
}

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD1_DATA32_LANE(vn, n)         \
  "lsr %5, %3, #16 \n"                   \
  "add %6, %1, %5, lsl #2 \n"            \
  "add %3, %3, %4 \n"                    \
  "ld1 {" #vn ".s}[" #n "], [%6] \n"

void ScaleARGBCols_NEON(uint8_t* dst_argb,
                        const uint8_t* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  int64_t tmp64;
  asm volatile(
      "1: \n"
      // clang-format off
      LOAD1_DATA32_LANE(v0, 0)
      LOAD1_DATA32_LANE(v0, 1)
      LOAD1_DATA32_LANE(v0, 2)
      LOAD1_DATA32_LANE(v0, 3)
      LOAD1_DATA32_LANE(v1, 0)
      LOAD1_DATA32_LANE(v1, 1)
      LOAD1_DATA32_LANE(v1, 2)
      LOAD1_DATA32_LANE(v1, 3)
      // clang-format on
      "st1 {v0.4s, v1.4s}, [%0], #32 \n"  // store pixels
      "subs %w2, %w2, #8 \n"  // 8 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "=&r"(tmp64),     // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1");
}

#undef LOAD1_DATA32_LANE
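
// Scalar sketch of the ARGB point sampling above (illustrative only), with x
// and dx in 16.16 fixed point:
//   const uint32_t* src = (const uint32_t*)src_argb;
//   uint32_t* dst = (uint32_t*)dst_argb;
//   for (int j = 0; j < dst_width; ++j) {
//     dst[j] = src[x >> 16];
//     x += dx;
//   }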

// TODO(Yang Zhang): Investigate fewer load instructions for
// the x/dx stepping
#define LOAD2_DATA32_LANE(vn1, vn2, n)              \
  "lsr %5, %3, #16 \n"                              \
  "add %6, %1, %5, lsl #2 \n"                       \
  "add %3, %3, %4 \n"                               \
  "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n"

void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
                              const uint8_t* src_argb,
                              int dst_width,
                              int x,
                              int dx) {
  int dx_offset[4] = {0, 1, 2, 3};
  int* tmp = dx_offset;
  const uint8_t* src_tmp = src_argb;
  int64_t x64 = (int64_t)x;    // NOLINT
  int64_t dx64 = (int64_t)dx;  // NOLINT
  asm volatile(
      "dup v0.4s, %w3 \n"  // x
      "dup v1.4s, %w4 \n"  // dx
      "ld1 {v2.4s}, [%5] \n"  // 0 1 2 3
      "shl v6.4s, v1.4s, #2 \n"  // 4 * dx
      "mul v1.4s, v1.4s, v2.4s \n"
      "movi v3.16b, #0x7f \n"  // 0x7F
      "movi v4.8h, #0x7f \n"  // 0x7F
      // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
      "add v5.4s, v1.4s, v0.4s \n"
      "1: \n"
      // d0, d1: a
      // d2, d3: b
      LOAD2_DATA32_LANE(v0, v1, 0)
      LOAD2_DATA32_LANE(v0, v1, 1)
      LOAD2_DATA32_LANE(v0, v1, 2)
      LOAD2_DATA32_LANE(v0, v1, 3)
      "shrn v2.4h, v5.4s, #9 \n"
      "and v2.8b, v2.8b, v4.8b \n"
      "dup v16.8b, v2.b[0] \n"
      "dup v17.8b, v2.b[2] \n"
      "dup v18.8b, v2.b[4] \n"
      "dup v19.8b, v2.b[6] \n"
      "ext v2.8b, v16.8b, v17.8b, #4 \n"
      "ext v17.8b, v18.8b, v19.8b, #4 \n"
      "ins v2.d[1], v17.d[0] \n"  // f
      "eor v7.16b, v2.16b, v3.16b \n"  // 0x7f ^ f
      "umull v16.8h, v0.8b, v7.8b \n"
      "umull2 v17.8h, v0.16b, v7.16b \n"
      "umull v18.8h, v1.8b, v2.8b \n"
      "umull2 v19.8h, v1.16b, v2.16b \n"
      "add v16.8h, v16.8h, v18.8h \n"
      "add v17.8h, v17.8h, v19.8h \n"
      "shrn v0.8b, v16.8h, #7 \n"
      "shrn2 v0.16b, v17.8h, #7 \n"

      "st1 {v0.4s}, [%0], #16 \n"  // store pixels
      "add v5.4s, v5.4s, v6.4s \n"
      "subs %w2, %w2, #4 \n"  // 4 processed per loop
      "b.gt 1b \n"
      : "+r"(dst_argb),   // %0
        "+r"(src_argb),   // %1
        "+r"(dst_width),  // %2
        "+r"(x64),        // %3
        "+r"(dx64),       // %4
        "+r"(tmp),        // %5
        "+r"(src_tmp)     // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
        "v6", "v7", "v16", "v17", "v18", "v19"
  );
}

#undef LOAD2_DATA32_LANE

// Read 16x2 average down and write 8x1.
void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint16_t* dst,
                              int dst_width) {
  asm volatile(
      // change the stride to row 2 pointer
      "add %1, %0, %1, lsl #1 \n"  // ptr + stride * 2
      "1: \n"
      "ld1 {v0.8h, v1.8h}, [%0], #32 \n"  // load row 1 and post inc
      "ld1 {v2.8h, v3.8h}, [%1], #32 \n"  // load row 2 and post inc
      "subs %w3, %w3, #8 \n"  // 8 processed per loop
      "uaddlp v0.4s, v0.8h \n"  // row 1 add adjacent
      "uaddlp v1.4s, v1.8h \n"
      "uadalp v0.4s, v2.8h \n"  // +row 2 add adjacent
      "uadalp v1.4s, v3.8h \n"
      "rshrn v0.4h, v0.4s, #2 \n"  // round and pack
      "rshrn2 v0.8h, v1.4s, #2 \n"
      "st1 {v0.8h}, [%2], #16 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      :
      : "v0", "v1", "v2", "v3"  // Clobber List
  );
}

// Read 8x2 upsample with filtering and write 16x1.
// Actually reads an extra pixel, so 9x2.
void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
                         ptrdiff_t src_stride,
                         uint16_t* dst,
                         int dst_width) {
  asm volatile(
      "add %1, %0, %1, lsl #1 \n"  // ptr + stride * 2
      "movi v0.8h, #9 \n"  // constants
      "movi v1.4s, #3 \n"

      "1: \n"
      "ld1 {v3.8h}, [%0], %4 \n"  // TL read first 8
      "ld1 {v4.8h}, [%0], %5 \n"  // TR read 8 offset by 1
      "ld1 {v5.8h}, [%1], %4 \n"  // BL read 8 from next row
      "ld1 {v6.8h}, [%1], %5 \n"  // BR offset by 1
      "subs %w3, %w3, #16 \n"  // 16 dst pixels per loop
      "umull v16.4s, v3.4h, v0.4h \n"
      "umull2 v7.4s, v3.8h, v0.8h \n"
      "umull v18.4s, v4.4h, v0.4h \n"
      "umull2 v17.4s, v4.8h, v0.8h \n"
      "uaddw v16.4s, v16.4s, v6.4h \n"
      "uaddl2 v19.4s, v6.8h, v3.8h \n"
      "uaddl v3.4s, v6.4h, v3.4h \n"
      "uaddw2 v6.4s, v7.4s, v6.8h \n"
      "uaddl2 v7.4s, v5.8h, v4.8h \n"
      "uaddl v4.4s, v5.4h, v4.4h \n"
      "uaddw v18.4s, v18.4s, v5.4h \n"
      "mla v16.4s, v4.4s, v1.4s \n"
      "mla v18.4s, v3.4s, v1.4s \n"
      "mla v6.4s, v7.4s, v1.4s \n"
      "uaddw2 v4.4s, v17.4s, v5.8h \n"
      "uqrshrn v16.4h, v16.4s, #4 \n"
      "mla v4.4s, v19.4s, v1.4s \n"
      "uqrshrn2 v16.8h, v6.4s, #4 \n"
      "uqrshrn v17.4h, v18.4s, #4 \n"
      "uqrshrn2 v17.8h, v4.4s, #4 \n"
      "st2 {v16.8h-v17.8h}, [%2], #32 \n"
      "b.gt 1b \n"
      : "+r"(src_ptr),     // %0
        "+r"(src_stride),  // %1
        "+r"(dst),         // %2
        "+r"(dst_width)    // %3
      : "r"(2LL),          // %4
        "r"(14LL)          // %5
      : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
        "v19"  // Clobber List
  );
}
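
// Each output pixel above is a 9:3:3:1 weighted average of the four nearest
// source pixels, divided by 16 with rounding. Scalar sketch of one output
// pair (illustrative only; TL/TR/BL/BR are src[x], src[x+1] and the pixels
// below them on the next row):
//   dst[2 * x + 0] = (9 * TL + 3 * TR + 3 * BL + 1 * BR + 8) >> 4;
//   dst[2 * x + 1] = (3 * TL + 9 * TR + 1 * BL + 3 * BR + 8) >> 4;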

#endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif