1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "./vpx_config.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vpx_dsp/arm/sum_neon.h"
17 
get_lane(const int32x2_t a)18 static INLINE tran_low_t get_lane(const int32x2_t a) {
19 #if CONFIG_VP9_HIGHBITDEPTH
20   return vget_lane_s32(a, 0);
21 #else
22   return vget_lane_s16(vreinterpret_s16_s32(a), 0);
23 #endif  // CONFIG_VP9_HIGHBITDETPH
24 }
25 
// Reduces a 4x4 block of |input| (rows |stride| elements apart) to a single
// value: the reduction is doubled and stored in output[0], and output[1] is
// cleared.
//
// NOTE(review): relies on horizontal_add_int16x8() (sum_neon.h) producing the
// sum of all eight lanes in lane 0 -- confirm against that helper.
void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
  const int16_t *const row1 = input + stride;
  const int16_t *const row2 = row1 + stride;
  const int16_t *const row3 = row2 + stride;

  // Lane-wise sums of rows {0, 2} and rows {1, 3}. Packing these two halves
  // gives the same eight lanes as packing the row pairs first and adding
  // afterwards.
  const int16x4_t rows_02 = vadd_s16(vld1_s16(input), vld1_s16(row2));
  const int16x4_t rows_13 = vadd_s16(vld1_s16(row1), vld1_s16(row3));
  const int16x8_t partial = vcombine_s16(rows_02, rows_13);

  // Scale the reduced value by two before extracting lane 0.
  const int32x2_t scaled = vshl_n_s32(horizontal_add_int16x8(partial), 1);

  output[0] = get_lane(scaled);
  output[1] = 0;
}
50 
// Reduces an 8x8 block of |input| (rows |stride| elements apart) to a single
// value stored, unscaled, in output[0]; output[1] is cleared.
//
// NOTE(review): the eight rows are accumulated per column in 16-bit lanes
// (vaddq_s16 neither widens nor saturates), so this presumably expects column
// sums that fit in int16_t -- confirm against callers.
void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
  const int16_t *row = input;
  int16x8_t column_sums = vld1q_s16(row);
  int rows_left;

  // Row 0 seeded the accumulator above; fold in the remaining seven rows.
  for (rows_left = 7; rows_left > 0; --rows_left) {
    row += stride;
    column_sums = vaddq_s16(column_sums, vld1q_s16(row));
  }

  output[0] = get_lane(horizontal_add_int16x8(column_sums));
  output[1] = 0;
}
63 
// Reduces a 16x16 block of |input| (rows |stride| elements apart) to a single
// value: the reduction is arithmetically shifted right by 1 and stored in
// output[0], and output[1] is cleared.
//
// NOTE(review): rows are accumulated per column in 16-bit lanes (vaddq_s16
// neither widens nor saturates), so this presumably expects column sums that
// fit in int16_t -- confirm against callers.
void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
                          int stride) {
  const int16_t *row = input;
  // Columns 0-7 and 8-15 of row 0 seed the two accumulators.
  int16x8_t cols_lo = vld1q_s16(row);
  int16x8_t cols_hi = vld1q_s16(row + 8);
  int32x2_t total;
  int rows_left;

  // Fold in rows 1..15.
  for (rows_left = 15; rows_left > 0; --rows_left) {
    row += stride;
    cols_lo = vaddq_s16(cols_lo, vld1q_s16(row));
    cols_hi = vaddq_s16(cols_hi, vld1q_s16(row + 8));
  }

  total = vadd_s32(horizontal_add_int16x8(cols_lo),
                   horizontal_add_int16x8(cols_hi));

  output[0] = get_lane(vshr_n_s32(total, 1));
  output[1] = 0;
}
85 
// Reduces a 32x32 block of |input| (rows |stride| elements apart) to a single
// value: the reduction is arithmetically shifted right by 3 and stored in
// output[0], and output[1] is cleared.
//
// NOTE(review): rows are accumulated per column in 16-bit lanes (vaddq_s16
// neither widens nor saturates), so this presumably expects column sums that
// fit in int16_t -- confirm against callers.
void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
                          int stride) {
  const int16_t *row = input;
  // Each accumulator covers eight consecutive columns; row 0 seeds them.
  int16x8_t cols_0_7 = vld1q_s16(row);
  int16x8_t cols_8_15 = vld1q_s16(row + 8);
  int16x8_t cols_16_23 = vld1q_s16(row + 16);
  int16x8_t cols_24_31 = vld1q_s16(row + 24);
  int32x2_t left_half, right_half, total;
  int rows_left;

  // Fold in rows 1..31.
  for (rows_left = 31; rows_left > 0; --rows_left) {
    row += stride;
    cols_0_7 = vaddq_s16(cols_0_7, vld1q_s16(row));
    cols_8_15 = vaddq_s16(cols_8_15, vld1q_s16(row + 8));
    cols_16_23 = vaddq_s16(cols_16_23, vld1q_s16(row + 16));
    cols_24_31 = vaddq_s16(cols_24_31, vld1q_s16(row + 24));
  }

  // Reduce each accumulator, then combine the four 32-bit partial results.
  left_half = vadd_s32(horizontal_add_int16x8(cols_0_7),
                       horizontal_add_int16x8(cols_8_15));
  right_half = vadd_s32(horizontal_add_int16x8(cols_16_23),
                        horizontal_add_int16x8(cols_24_31));
  total = vadd_s32(left_half, right_half);

  output[0] = get_lane(vshr_n_s32(total, 3));
  output[1] = 0;
}
114