1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 //  NEON common code.
11 
12 #ifndef WEBP_DSP_NEON_H_
13 #define WEBP_DSP_NEON_H_
14 
15 #include <arm_neon.h>
16 
17 #include "./dsp.h"
18 
// Some intrinsic functions currently appear to be slower than the inline
// assembly, so intrinsics are only enabled on aarch64, where the inline
// assembly is incompatible.
#ifdef __aarch64__
#define WEBP_USE_INTRINSICS   // use intrinsics when possible
#endif
24 
// Stores 'a' and 'b' into v.val[0] and v.val[1] respectively.
// 'v' must be an lvalue with a 'val' array member of at least two elements.
// All arguments are parenthesized so that expressions such as '*ptr' expand
// correctly; note that 'v' is still evaluated once per element.
#define INIT_VECTOR2(v, a, b) do {  \
  (v).val[0] = (a);                 \
  (v).val[1] = (b);                 \
} while (0)
29 
// Stores 'a', 'b' and 'c' into v.val[0], v.val[1] and v.val[2] respectively.
// 'v' must be an lvalue with a 'val' array member of at least three elements.
// All arguments are parenthesized so that expressions such as '*ptr' expand
// correctly; note that 'v' is still evaluated once per element.
#define INIT_VECTOR3(v, a, b, c) do {  \
  (v).val[0] = (a);                    \
  (v).val[1] = (b);                    \
  (v).val[2] = (c);                    \
} while (0)
35 
// Stores 'a', 'b', 'c' and 'd' into v.val[0] .. v.val[3] respectively.
// 'v' must be an lvalue with a 'val' array member of at least four elements.
// All arguments are parenthesized so that expressions such as '*ptr' expand
// correctly; note that 'v' is still evaluated once per element.
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)
42 
43 // if using intrinsics, this flag avoids some functions that make gcc-4.6.3
44 // crash ("internal compiler error: in immed_double_const, at emit-rtl.").
45 // (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
46 #if !(LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
47 #define WORK_AROUND_GCC
48 #endif
49 
Transpose4x4(const int32x4x4_t rows)50 static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
51   uint64x2x2_t row01, row23;
52 
53   row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
54   row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
55   row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
56   row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
57   // Transpose 64-bit values (there's no vswp equivalent)
58   {
59     const uint64x1_t row0h = vget_high_u64(row01.val[0]);
60     const uint64x1_t row2l = vget_low_u64(row23.val[0]);
61     const uint64x1_t row1h = vget_high_u64(row01.val[1]);
62     const uint64x1_t row3l = vget_low_u64(row23.val[1]);
63     row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
64     row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
65     row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
66     row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
67   }
68   {
69     const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
70                                         vreinterpretq_s32_u64(row01.val[1]));
71     const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
72                                         vreinterpretq_s32_u64(row23.val[1]));
73     int32x4x4_t out;
74     out.val[0] = out01.val[0];
75     out.val[1] = out01.val[1];
76     out.val[2] = out23.val[0];
77     out.val[3] = out23.val[1];
78     return out;
79   }
80 }
81 
#if 0     // Useful debug macro.
#include <stdio.h>
// Prints the spelling of REG, the value of SIZE, and the contents of REG in
// hex: eight 8-bit values when SIZE == 8, four 16-bit values when SIZE == 16.
// Any other SIZE prints only the header followed by a newline.
// SIZE is evaluated once, and the macro's locals carry a 'print_reg_' prefix
// so they cannot capture identifiers (such as 'i') used in the arguments.
// NOTE(review): both the vst1_u8 and the vst1_u16 store are compiled for every
// expansion, so REG must be accepted by both -- presumably this relies on lax
// vector conversions; verify before enabling.
#define PRINT_REG(REG, SIZE) do {                                     \
  const int print_reg_size_ = (SIZE);                                 \
  int print_reg_i_;                                                   \
  printf("%s \t[%d]: 0x", #REG, print_reg_size_);                     \
  if (print_reg_size_ == 8) {                                         \
    uint8_t print_reg_tmp_[8];                                        \
    vst1_u8(print_reg_tmp_, (REG));                                   \
    for (print_reg_i_ = 0; print_reg_i_ < 8; ++print_reg_i_) {        \
      printf("%.2x ", print_reg_tmp_[print_reg_i_]);                  \
    }                                                                 \
  } else if (print_reg_size_ == 16) {                                 \
    uint16_t print_reg_tmp_[4];                                       \
    vst1_u16(print_reg_tmp_, (REG));                                  \
    for (print_reg_i_ = 0; print_reg_i_ < 4; ++print_reg_i_) {        \
      printf("%.4x ", print_reg_tmp_[print_reg_i_]);                  \
    }                                                                 \
  }                                                                   \
  printf("\n");                                                       \
} while (0)
#endif
99 
100 #endif  // WEBP_DSP_NEON_H_
101