1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vp9_rtcd.h"
12 #include "vp9/common/vp9_common.h"
13 
/*
 * Two-pass kernels used by the 16x16 inverse DCT wrappers in this file.
 * They are only declared here; their definitions live outside this file.
 *
 * How the wrappers below use them:
 *  - pass1 is called first with a coefficient pointer, a scratch buffer and
 *    an output stride of 8. Per the comments at the call sites it handles
 *    the even-indexed elements and leaves the stage 6 result in the scratch
 *    buffer.
 *  - pass2 is called next with the coefficient pointer advanced by one
 *    element, an output buffer, the scratch buffer written by pass1
 *    (pass1Output), a flag, and the destination pixels plus stride. Per the
 *    comments at the call sites it handles the odd-indexed elements and
 *    combines them with the pass1 result.
 *  - skip_adding is passed as 0 during the row stage (result kept in the
 *    output buffer) and as 1 during the column stage (result added to dest).
 *    NOTE(review): the parameter name reads as the opposite of that usage;
 *    confirm the exact meaning against the kernel implementation.
 */
void vp9_idct16x16_256_add_neon_pass1(const int16_t *input,
                                      int16_t *output,
                                      int output_stride);
void vp9_idct16x16_256_add_neon_pass2(const int16_t *src,
                                      int16_t *output,
                                      int16_t *pass1Output,
                                      int16_t skip_adding,
                                      uint8_t *dest,
                                      int dest_stride);
/*
 * Same pass1/pass2 interface as above, used by vp9_idct16x16_10_add_neon
 * for its row stage only.
 */
void vp9_idct16x16_10_add_neon_pass1(const int16_t *input,
                                     int16_t *output,
                                     int output_stride);
void vp9_idct16x16_10_add_neon_pass2(const int16_t *src,
                                     int16_t *output,
                                     int16_t *pass1Output,
                                     int16_t skip_adding,
                                     uint8_t *dest,
                                     int dest_stride);

/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
/* Both helpers take the same caller-provided buffer; the wrappers below */
/* pass an int64_t[8] array to push before the kernels run and to pop after. */
extern void vp9_push_neon(int64_t *store);
extern void vp9_pop_neon(int64_t *store);
36 
/*
 * Full 16x16 inverse DCT, with the result added into dest.
 *
 * input       - 16x16 block of coefficients.
 * dest        - destination pixels that receive the added result.
 * dest_stride - stride of dest, forwarded unchanged to the pass2 kernel.
 *
 * The work is split into four half-block steps: upper rows, lower rows,
 * left columns, right columns. Each step runs pass1 (even elements, stage 6
 * result left in the scratch buffer) followed by pass2 (odd elements,
 * combined with the pass1 result for stage 7). The row steps store into an
 * intermediate buffer; the column steps add into dest.
 */
void vp9_idct16x16_256_add_neon(const int16_t *input,
                                uint8_t *dest, int dest_stride) {
  int64_t saved_d8_d15[8];
  int16_t even_stage[16 * 16] = {0};  /* pass1 scratch, reused by each step */
  int16_t row_result[16 * 16] = {0};  /* output of the row stage */

  /* Starting points of the second half for each stage. */
  const int16_t *const lower_rows = input + 8 * 16;
  int16_t *const right_cols = row_result + 8 * 16;

  /* d8-d15 are callee-saved, so stash them before the kernels run. */
  vp9_push_neon(saved_d8_d15);

  /* Row stage, upper 8 rows: result goes to row_result, dest is not added. */
  vp9_idct16x16_256_add_neon_pass1(input, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(input + 1, row_result, even_stage,
                                   0, dest, dest_stride);

  /* Row stage, lower 8 rows: same as above, written at offset 8. */
  vp9_idct16x16_256_add_neon_pass1(lower_rows, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(lower_rows + 1, row_result + 8, even_stage,
                                   0, dest, dest_stride);

  /* Column stage, left 8 columns: stage 7 result is added to dest. */
  vp9_idct16x16_256_add_neon_pass1(row_result, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(row_result + 1, row_result, even_stage,
                                   1, dest, dest_stride);

  /* Column stage, right 8 columns: result is added starting at dest + 8. */
  vp9_idct16x16_256_add_neon_pass1(right_cols, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(right_cols + 1, row_result + 8, even_stage,
                                   1, dest + 8, dest_stride);

  /* Put d8-d15 back the way the caller left them. */
  vp9_pop_neon(saved_d8_d15);
}
111 
/*
 * 16x16 inverse DCT variant that skips the lower 8 rows of the row stage
 * because they are all zero. The result is added into dest.
 *
 * input       - 16x16 block of coefficients.
 * dest        - destination pixels that receive the added result.
 * dest_stride - stride of dest, forwarded unchanged to the pass2 kernel.
 *
 * The row stage runs once, on the upper 8 rows, using the _10_ kernels. The
 * column stage then runs on both halves with the _256_ kernels. Each step is
 * pass1 (even elements, stage 6 result left in the scratch buffer) followed
 * by pass2 (odd elements, combined with the pass1 result for stage 7).
 */
void vp9_idct16x16_10_add_neon(const int16_t *input,
                               uint8_t *dest, int dest_stride) {
  int64_t saved_d8_d15[8];
  int16_t even_stage[16 * 16] = {0};  /* pass1 scratch, reused by each step */
  /* Zero-filled on purpose: the part the skipped lower-rows step would */
  /* have written is still read by the column stage. */
  int16_t row_result[16 * 16] = {0};

  /* Starting point of the second half of the column stage. */
  int16_t *const right_cols = row_result + 8 * 16;

  /* d8-d15 are callee-saved, so stash them before the kernels run. */
  vp9_push_neon(saved_d8_d15);

  /* Row stage, upper 8 rows: result goes to row_result, dest is not added. */
  vp9_idct16x16_10_add_neon_pass1(input, even_stage, 8);
  vp9_idct16x16_10_add_neon_pass2(input + 1, row_result, even_stage,
                                  0, dest, dest_stride);

  /* Row stage for the lower 8 rows is skipped: they are all 0s. */

  /* Column stage, left 8 columns: stage 7 result is added to dest. */
  vp9_idct16x16_256_add_neon_pass1(row_result, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(row_result + 1, row_result, even_stage,
                                   1, dest, dest_stride);

  /* Column stage, right 8 columns: result is added starting at dest + 8. */
  vp9_idct16x16_256_add_neon_pass1(right_cols, even_stage, 8);
  vp9_idct16x16_256_add_neon_pass2(right_cols + 1, row_result + 8, even_stage,
                                   1, dest + 8, dest_stride);

  /* Put d8-d15 back the way the caller left them. */
  vp9_pop_neon(saved_d8_d15);
}
173