1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vp8_rtcd.h"
14 
// DC-only dequantize + inverse transform + add for two side-by-side 4x4
// pixel blocks.
//
//   q      Coefficient buffer. Only q[0] and q[16] are read (the first
//          coefficient of two consecutive 16-entry groups, presumably the DC
//          terms of the left and right block); both are reset to 0.
//   dq     Dequantization factor applied to both coefficients.
//   dst    Top-left pixel of the left block. The right block starts 4 bytes
//          further on. Pixels are read, offset and written back in place.
//   stride Distance in bytes between consecutive pixel rows of dst.
//
// Each block gets the constant ((coeff * dq) + 4) >> 3 added to all 16 of its
// pixels, with the result saturated to the unsigned 8-bit range.
static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
                                   int stride) {
  int16x8_t dc_offset[2];
  int blk;
  const int dc_left = ((q[0] * dq) + 4) >> 3;
  const int dc_right = ((q[16] * dq) + 4) >> 3;

  // The two coefficients have been consumed; leave zeros behind.
  q[0] = 0;
  q[16] = 0;

  dc_offset[0] = vdupq_n_s16((int16_t)dc_left);
  dc_offset[1] = vdupq_n_s16((int16_t)dc_right);

  for (blk = 0; blk < 2; ++blk) {
    unsigned char *const row0 = dst + 4 * blk;
    unsigned char *const row1 = row0 + stride;
    unsigned char *const row2 = row1 + stride;
    unsigned char *const row3 = row2 + stride;
    const uint16x8_t offset_u16 = vreinterpretq_u16_s16(dc_offset[blk]);
    int32x2_t rows01 = vdup_n_s32(0);
    int32x2_t rows23 = vdup_n_s32(0);
    uint16x8_t sum01, sum23;

    // Gather the 4x4 block: each 32-bit lane holds one row of four pixels.
    rows01 = vld1_lane_s32((const int32_t *)row0, rows01, 0);
    rows01 = vld1_lane_s32((const int32_t *)row1, rows01, 1);
    rows23 = vld1_lane_s32((const int32_t *)row2, rows23, 0);
    rows23 = vld1_lane_s32((const int32_t *)row3, rows23, 1);

    // Widen the pixels to 16 bits and add the offset. The add is done on the
    // unsigned view, which is bit-identical to a signed 16-bit add.
    sum01 = vaddw_u8(offset_u16, vreinterpret_u8_s32(rows01));
    sum23 = vaddw_u8(offset_u16, vreinterpret_u8_s32(rows23));

    // Reinterpret as signed and narrow with saturation to [0, 255].
    rows01 = vreinterpret_s32_u8(vqmovun_s16(vreinterpretq_s16_u16(sum01)));
    rows23 = vreinterpret_s32_u8(vqmovun_s16(vreinterpretq_s16_u16(sum23)));

    vst1_lane_s32((int32_t *)row0, rows01, 0);
    vst1_lane_s32((int32_t *)row1, rows01, 1);
    vst1_lane_s32((int32_t *)row2, rows23, 0);
    vst1_lane_s32((int32_t *)row3, rows23, 1);
  }
}
62 
// Fixed-point multipliers for the inverse DCT butterflies. Both are applied
// with vqdmulhq_n_s16, which returns the high half of the doubled product,
// i.e. (2 * a * b) >> 16.
//
// cospi8sqrt2minus1 is stored at full scale, so every product formed with it
// is halved afterwards with a right shift by one.
static const int16_t cospi8sqrt2minus1 = 20091;
// sinpi8sqrt2 is stored pre-shifted. The full-scale value 0x8a8c (35468) does
// not fit in an int16_t, but its lowest bit is 0, so 0x8a8c >> 1 == 17734
// loses no precision and the doubling in vqdmulhq_n_s16 restores the scale.
static const int16_t sinpi8sqrt2 = 17734;

// Full dequantize + 4x4 inverse transform + add for two side-by-side 4x4
// pixel blocks, processed together in 128-bit vectors (left block in the low
// half of each vector, right block in the high half).
//
//   q      32 coefficients: q[0..15] for the left block, q[16..31] for the
//          right block. All 32 entries are overwritten with 0.
//   dq     16 dequantization factors, applied element-wise to both blocks.
//   dst    Top-left pixel of the left block. The right block starts 4 bytes
//          further on. Pixels are read, updated and written back in place.
//   stride Distance in bytes between consecutive pixel rows of dst.
static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq,
                                      unsigned char *dst, int stride) {
  const int16x8_t zero = vdupq_n_s16(0);
  unsigned char *left = dst;
  unsigned char *right = dst + 4;
  int32x2_t pred0 = vdup_n_s32(0);
  int32x2_t pred1 = vdup_n_s32(0);
  int32x2_t pred2 = vdup_n_s32(0);
  int32x2_t pred3 = vdup_n_s32(0);
  int16x8_t dq_lo, dq_hi;
  int16x8_t coef0, coef1, coef2, coef3;
  int16x8_t in0, in1, in2, in3;
  int16x8_t sin1, sin3, cos1, cos3;
  int16x8_t even_sum, even_diff, odd_c, odd_d;
  int16x8_t out0, out1, out2, out3;
  int32x4x2_t trn32_02, trn32_13;
  int16x8x2_t trn16_lo, trn16_hi;

  // Dequantization factors: entries 0-7 and 8-15.
  dq_lo = vld1q_s16(dq);
  dq_hi = vld1q_s16(dq + 8);

  // Fetch the coefficients eight at a time, clearing each group once read.
  coef0 = vld1q_s16(q);
  vst1q_s16(q, zero);
  coef1 = vld1q_s16(q + 8);
  vst1q_s16(q + 8, zero);
  coef2 = vld1q_s16(q + 16);
  vst1q_s16(q + 16, zero);
  coef3 = vld1q_s16(q + 24);
  vst1q_s16(q + 24, zero);

  // Fetch the pixels currently in dst. For each row, lane 0 takes four pixels
  // of the left block and lane 1 four pixels of the right block.
  pred0 = vld1_lane_s32((const int32_t *)left, pred0, 0);
  pred0 = vld1_lane_s32((const int32_t *)right, pred0, 1);
  left += stride;
  right += stride;
  pred1 = vld1_lane_s32((const int32_t *)left, pred1, 0);
  pred1 = vld1_lane_s32((const int32_t *)right, pred1, 1);
  left += stride;
  right += stride;
  pred2 = vld1_lane_s32((const int32_t *)left, pred2, 0);
  pred2 = vld1_lane_s32((const int32_t *)right, pred2, 1);
  left += stride;
  right += stride;
  pred3 = vld1_lane_s32((const int32_t *)left, pred3, 0);
  pred3 = vld1_lane_s32((const int32_t *)right, pred3, 1);

  // Dequantize.
  coef0 = vmulq_s16(coef0, dq_lo);
  coef1 = vmulq_s16(coef1, dq_hi);
  coef2 = vmulq_s16(coef2, dq_lo);
  coef3 = vmulq_s16(coef3, dq_hi);

  // Regroup so that vector k holds the k-th group of four coefficients of the
  // left block in its low half and of the right block in its high half.
  in0 = vcombine_s16(vget_low_s16(coef0), vget_low_s16(coef2));
  in1 = vcombine_s16(vget_high_s16(coef0), vget_high_s16(coef2));
  in2 = vcombine_s16(vget_low_s16(coef1), vget_low_s16(coef3));
  in3 = vcombine_s16(vget_high_s16(coef1), vget_high_s16(coef3));

  // First 1-D pass.
  sin1 = vqdmulhq_n_s16(in1, sinpi8sqrt2);
  sin3 = vqdmulhq_n_s16(in3, sinpi8sqrt2);
  // x + ((x * cospi8sqrt2minus1) >> 16); see the note on the constant for why
  // the product is shifted right by one.
  cos1 = vqaddq_s16(
      in1, vshrq_n_s16(vqdmulhq_n_s16(in1, cospi8sqrt2minus1), 1));
  cos3 = vqaddq_s16(
      in3, vshrq_n_s16(vqdmulhq_n_s16(in3, cospi8sqrt2minus1), 1));

  even_sum = vqaddq_s16(in0, in2);
  even_diff = vqsubq_s16(in0, in2);
  odd_c = vqsubq_s16(sin1, cos3);
  odd_d = vqaddq_s16(sin3, cos1);

  out0 = vqaddq_s16(even_sum, odd_d);
  out1 = vqaddq_s16(even_diff, odd_c);
  out2 = vqsubq_s16(even_diff, odd_c);
  out3 = vqsubq_s16(even_sum, odd_d);

  // Transpose each 4x4 block inside its own 64-bit half: swap 32-bit pairs
  // first, then the 16-bit elements.
  trn32_02 = vtrnq_s32(vreinterpretq_s32_s16(out0),
                       vreinterpretq_s32_s16(out2));
  trn32_13 = vtrnq_s32(vreinterpretq_s32_s16(out1),
                       vreinterpretq_s32_s16(out3));
  trn16_lo = vtrnq_s16(vreinterpretq_s16_s32(trn32_02.val[0]),
                       vreinterpretq_s16_s32(trn32_13.val[0]));
  trn16_hi = vtrnq_s16(vreinterpretq_s16_s32(trn32_02.val[1]),
                       vreinterpretq_s16_s32(trn32_13.val[1]));

  in0 = trn16_lo.val[0];
  in1 = trn16_lo.val[1];
  in2 = trn16_hi.val[0];
  in3 = trn16_hi.val[1];

  // Second 1-D pass: the same butterflies on the transposed data.
  sin1 = vqdmulhq_n_s16(in1, sinpi8sqrt2);
  sin3 = vqdmulhq_n_s16(in3, sinpi8sqrt2);
  cos1 = vqaddq_s16(
      in1, vshrq_n_s16(vqdmulhq_n_s16(in1, cospi8sqrt2minus1), 1));
  cos3 = vqaddq_s16(
      in3, vshrq_n_s16(vqdmulhq_n_s16(in3, cospi8sqrt2minus1), 1));

  even_sum = vqaddq_s16(in0, in2);
  even_diff = vqsubq_s16(in0, in2);
  odd_c = vqsubq_s16(sin1, cos3);
  odd_d = vqaddq_s16(sin3, cos1);

  // Final scaling: rounding shift right by 3, i.e. (x + 4) >> 3.
  out0 = vrshrq_n_s16(vqaddq_s16(even_sum, odd_d), 3);
  out1 = vrshrq_n_s16(vqaddq_s16(even_diff, odd_c), 3);
  out2 = vrshrq_n_s16(vqsubq_s16(even_diff, odd_c), 3);
  out3 = vrshrq_n_s16(vqsubq_s16(even_sum, odd_d), 3);

  // Transpose back so each vector again matches one pixel row of dst.
  trn32_02 = vtrnq_s32(vreinterpretq_s32_s16(out0),
                       vreinterpretq_s32_s16(out2));
  trn32_13 = vtrnq_s32(vreinterpretq_s32_s16(out1),
                       vreinterpretq_s32_s16(out3));
  trn16_lo = vtrnq_s16(vreinterpretq_s16_s32(trn32_02.val[0]),
                       vreinterpretq_s16_s32(trn32_13.val[0]));
  trn16_hi = vtrnq_s16(vreinterpretq_s16_s32(trn32_02.val[1]),
                       vreinterpretq_s16_s32(trn32_13.val[1]));

  // Add the residual to the widened pixels. The add is done on the unsigned
  // view, which is bit-identical to a signed 16-bit add.
  out0 = vreinterpretq_s16_u16(vaddw_u8(
      vreinterpretq_u16_s16(trn16_lo.val[0]), vreinterpret_u8_s32(pred0)));
  out1 = vreinterpretq_s16_u16(vaddw_u8(
      vreinterpretq_u16_s16(trn16_lo.val[1]), vreinterpret_u8_s32(pred1)));
  out2 = vreinterpretq_s16_u16(vaddw_u8(
      vreinterpretq_u16_s16(trn16_hi.val[0]), vreinterpret_u8_s32(pred2)));
  out3 = vreinterpretq_s16_u16(vaddw_u8(
      vreinterpretq_u16_s16(trn16_hi.val[1]), vreinterpret_u8_s32(pred3)));

  // Narrow back to bytes with saturation to [0, 255].
  pred0 = vreinterpret_s32_u8(vqmovun_s16(out0));
  pred1 = vreinterpret_s32_u8(vqmovun_s16(out1));
  pred2 = vreinterpret_s32_u8(vqmovun_s16(out2));
  pred3 = vreinterpret_s32_u8(vqmovun_s16(out3));

  // Write the rows back in the layout they were read in.
  left = dst;
  right = dst + 4;
  vst1_lane_s32((int32_t *)left, pred0, 0);
  vst1_lane_s32((int32_t *)right, pred0, 1);
  left += stride;
  right += stride;
  vst1_lane_s32((int32_t *)left, pred1, 0);
  vst1_lane_s32((int32_t *)right, pred1, 1);
  left += stride;
  right += stride;
  vst1_lane_s32((int32_t *)left, pred2, 0);
  vst1_lane_s32((int32_t *)right, pred2, 1);
  left += stride;
  right += stride;
  vst1_lane_s32((int32_t *)left, pred3, 0);
  vst1_lane_s32((int32_t *)right, pred3, 1);
}
233 
// Dequantize, inverse-transform and add a 4x4 grid of 4x4 blocks, handled
// as four rows of two block pairs each.
//
//   q      Coefficients, 16 per block and 64 per row of blocks. The helpers
//          clear the coefficients they consume.
//   dq     Dequantization factors shared by every block. The DC-only path
//          uses dq[0] alone.
//   dst    Top-left pixel of the region, updated in place.
//   stride Distance in bytes between consecutive pixel rows of dst.
//   eobs   One byte per block, four per row of blocks (presumably each
//          block's end-of-block position). A pair of blocks is skipped when
//          both of its bytes are 0, takes the DC-only path when neither byte
//          has a bit other than bit 0 set, and takes the full transform
//          otherwise.
//
// The two bytes of a pair are inspected individually rather than through a
// (short *) cast. The selected path is the same for every input and on either
// endianness, but the char array is no longer read through an incompatible
// type and eobs needs no 2-byte alignment.
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
                                       int stride, char *eobs) {
  int row, pair;

  for (row = 0; row < 4; ++row) {
    for (pair = 0; pair < 2; ++pair) {
      // OR of the pair's two bytes: zero iff both are zero, and bits 1-7 are
      // clear iff both bytes are 0 or 1.
      const int eob_bits = eobs[2 * pair] | eobs[2 * pair + 1];

      if (eob_bits & 0xfe) {
        idct_dequant_full_2x_neon(q + 32 * pair, dq, dst + 8 * pair, stride);
      } else if (eob_bits) {
        idct_dequant_0_2x_neon(q + 32 * pair, dq[0], dst + 8 * pair, stride);
      }
    }

    // Advance to the next row of four blocks.
    q += 64;
    dst += 4 * stride;
    eobs += 4;
  }
}
257 
// Dequantize, inverse-transform and add two 2x2 grids of 4x4 blocks, one at
// dst_u and one at dst_v. Each grid is handled as two block pairs: the upper
// pair, then the pair 4 rows below it.
//
//   q      Coefficients, 16 per block, laid out as U upper pair, U lower
//          pair, V upper pair, V lower pair (32 per pair). The helpers clear
//          the coefficients they consume.
//   dq     Dequantization factors shared by every block. The DC-only path
//          uses dq[0] alone.
//   dst_u  Top-left pixel of the first region, updated in place.
//   dst_v  Top-left pixel of the second region, updated in place.
//   stride Distance in bytes between consecutive pixel rows; the same value
//          is used for both regions.
//   eobs   One byte per block, in the same order as q (presumably each
//          block's end-of-block position). A pair of blocks is skipped when
//          both of its bytes are 0, takes the DC-only path when neither byte
//          has a bit other than bit 0 set, and takes the full transform
//          otherwise.
//
// The two bytes of a pair are inspected individually rather than through a
// (short *) cast. The selected path is the same for every input and on either
// endianness, but the char array is no longer read through an incompatible
// type and eobs needs no 2-byte alignment.
void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
                                        unsigned char *dst_u,
                                        unsigned char *dst_v, int stride,
                                        char *eobs) {
  int pair;

  for (pair = 0; pair < 4; ++pair) {
    // Pairs 0-1 go to dst_u and pairs 2-3 to dst_v. Within a region the odd
    // pair sits four pixel rows below the even one.
    unsigned char *const plane = (pair < 2) ? dst_u : dst_v;
    unsigned char *const out = (pair & 1) ? plane + 4 * stride : plane;
    // OR of the pair's two bytes: zero iff both are zero, and bits 1-7 are
    // clear iff both bytes are 0 or 1.
    const int eob_bits = eobs[0] | eobs[1];

    if (eob_bits & 0xfe) {
      idct_dequant_full_2x_neon(q, dq, out, stride);
    } else if (eob_bits) {
      idct_dequant_0_2x_neon(q, dq[0], out, stride);
    }

    q += 32;
    eobs += 2;
  }
}
298