1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vp8_rtcd.h"
14 
// DC-only dequantize / inverse transform / add for two side-by-side 4x4
// blocks.
//
// q      coefficients; only q[0] (first block) and q[16] (second block) are
//        read, and both are reset to zero before returning.
// dq     dequantization factor applied to both DC coefficients.
// dst    top-left pixel of the first block; the second block starts 4 bytes
//        to the right. The existing pixels are used as the prediction and
//        are overwritten with the reconstructed values.
// stride distance in bytes between consecutive pixel rows of dst.
static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst,
                                   int stride) {
  int16x8_t dc_vec[2];
  int blk;

  // Dequantize each DC term and apply the rounding shift of the transform.
  const int dc_first = ((q[0] * dq) + 4) >> 3;
  const int dc_second = ((q[16] * dq) + 4) >> 3;
  q[0] = q[16] = 0;

  dc_vec[0] = vdupq_n_s16((int16_t)dc_first);
  dc_vec[1] = vdupq_n_s16((int16_t)dc_second);

  for (blk = 0; blk < 2; ++blk) {
    unsigned char *const row0 = dst + 4 * blk;
    unsigned char *const row1 = row0 + stride;
    unsigned char *const row2 = row1 + stride;
    unsigned char *const row3 = row2 + stride;
    int32x2_t rows01 = vdup_n_s32(0);
    int32x2_t rows23 = vdup_n_s32(0);
    uint16x8_t sum01, sum23;

    // Gather the 4x4 prediction: four pixels (one 32-bit lane) per row.
    rows01 = vld1_lane_s32((const int32_t *)row0, rows01, 0);
    rows01 = vld1_lane_s32((const int32_t *)row1, rows01, 1);
    rows23 = vld1_lane_s32((const int32_t *)row2, rows23, 0);
    rows23 = vld1_lane_s32((const int32_t *)row3, rows23, 1);

    // Widen the pixels to 16 bits while adding this block's DC value.
    sum01 = vaddw_u8(vreinterpretq_u16_s16(dc_vec[blk]),
                     vreinterpret_u8_s32(rows01));
    sum23 = vaddw_u8(vreinterpretq_u16_s16(dc_vec[blk]),
                     vreinterpret_u8_s32(rows23));

    // Saturate back to the 0..255 pixel range.
    rows01 = vreinterpret_s32_u8(vqmovun_s16(vreinterpretq_s16_u16(sum01)));
    rows23 = vreinterpret_s32_u8(vqmovun_s16(vreinterpretq_s16_u16(sum23)));

    vst1_lane_s32((int32_t *)row0, rows01, 0);
    vst1_lane_s32((int32_t *)row1, rows01, 1);
    vst1_lane_s32((int32_t *)row2, rows23, 0);
    vst1_lane_s32((int32_t *)row3, rows23, 1);
  }
}
61 
// Fixed-point multipliers of the 4x4 inverse transform, applied with
// vqdmulh (a doubling multiply that keeps the high 16 bits).
//
// cospi8sqrt2minus1 is stored as-is; the doubling is undone by an explicit
// shift right by one after the multiply.
static const int16_t cospi8sqrt2minus1 = 20091;
// The sine multiplier is 0x8a8c (35468), which does not fit in int16_t.
// Because its lowest bit is 0 it is stored pre-shifted right by one (17734);
// the doubling done by vqdmulh restores the intended scale.
static const int16_t sinpi8sqrt2 = 17734;

// One 1-D pass of the inverse transform, evaluated for eight lanes at once.
// in0/in2 feed the sum/difference pair, in1/in3 feed the sine/cosine
// rotation. The four results are written to out[0..3].
static void idct_pass_2x_neon(const int16x8_t in0, const int16x8_t in1,
                              const int16x8_t in2, const int16x8_t in3,
                              int16x8_t out[4]) {
  const int16x8_t even_sum = vqaddq_s16(in0, in2);
  const int16x8_t even_diff = vqsubq_s16(in0, in2);

  const int16x8_t in1_sin = vqdmulhq_n_s16(in1, sinpi8sqrt2);
  const int16x8_t in3_sin = vqdmulhq_n_s16(in3, sinpi8sqrt2);

  // x + ((2 * x * cospi8sqrt2minus1) >> 16 >> 1)
  const int16x8_t in1_cos = vqaddq_s16(
      in1, vshrq_n_s16(vqdmulhq_n_s16(in1, cospi8sqrt2minus1), 1));
  const int16x8_t in3_cos = vqaddq_s16(
      in3, vshrq_n_s16(vqdmulhq_n_s16(in3, cospi8sqrt2minus1), 1));

  const int16x8_t odd_diff = vqsubq_s16(in1_sin, in3_cos);
  const int16x8_t odd_sum = vqaddq_s16(in3_sin, in1_cos);

  out[0] = vqaddq_s16(even_sum, odd_sum);
  out[1] = vqaddq_s16(even_diff, odd_diff);
  out[2] = vqsubq_s16(even_diff, odd_diff);
  out[3] = vqsubq_s16(even_sum, odd_sum);
}

// Transposes the two 4x4 tiles held in in[0..3] (one tile in the low halves
// of the vectors, the other in the high halves) using a 32-bit followed by a
// 16-bit vtrn, and writes the result to out[0..3].
static void idct_transpose_2x_neon(const int16x8_t in[4], int16x8_t out[4]) {
  const int32x4x2_t pair02 =
      vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[2]));
  const int32x4x2_t pair13 =
      vtrnq_s32(vreinterpretq_s32_s16(in[1]), vreinterpretq_s32_s16(in[3]));
  const int16x8x2_t first = vtrnq_s16(vreinterpretq_s16_s32(pair02.val[0]),
                                      vreinterpretq_s16_s32(pair13.val[0]));
  const int16x8x2_t second = vtrnq_s16(vreinterpretq_s16_s32(pair02.val[1]),
                                       vreinterpretq_s16_s32(pair13.val[1]));

  out[0] = first.val[0];
  out[1] = first.val[1];
  out[2] = second.val[0];
  out[3] = second.val[1];
}

// Full dequantize / inverse transform / add for two side-by-side 4x4 blocks.
//
// q      32 coefficients: q[0..15] belong to the first block, q[16..31] to
//        the second. All 32 are set to zero once they have been loaded.
// dq     16 dequantization factors, shared by both blocks.
// dst    top-left pixel of the first block; the second block starts 4 bytes
//        to the right. The existing pixels are used as the prediction and
//        are overwritten with the reconstructed values.
// stride distance in bytes between consecutive pixel rows of dst.
static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq,
                                      unsigned char *dst, int stride) {
  unsigned char *const row0 = dst;
  unsigned char *const row1 = row0 + stride;
  unsigned char *const row2 = row1 + stride;
  unsigned char *const row3 = row2 + stride;
  const int16x8_t zero = vdupq_n_s16(0);
  int16x8_t dq_lo, dq_hi;
  int16x8_t coeff[4];
  int16x8_t stage[4], flipped[4];
  int32x2_t pred[4];
  int k;

  // Dequantization factors.
  dq_lo = vld1q_s16(dq);
  dq_hi = vld1q_s16(dq + 8);

  // Fetch the coefficients and clear them in memory as we go.
  for (k = 0; k < 4; ++k) {
    coeff[k] = vld1q_s16(q + 8 * k);
    vst1q_s16(q + 8 * k, zero);
  }

  // Prediction pixels: lane 0 holds the first block's row, lane 1 the
  // second block's row.
  for (k = 0; k < 4; ++k) pred[k] = vdup_n_s32(0);
  pred[0] = vld1_lane_s32((const int32_t *)row0, pred[0], 0);
  pred[0] = vld1_lane_s32((const int32_t *)(row0 + 4), pred[0], 1);
  pred[1] = vld1_lane_s32((const int32_t *)row1, pred[1], 0);
  pred[1] = vld1_lane_s32((const int32_t *)(row1 + 4), pred[1], 1);
  pred[2] = vld1_lane_s32((const int32_t *)row2, pred[2], 0);
  pred[2] = vld1_lane_s32((const int32_t *)(row2 + 4), pred[2], 1);
  pred[3] = vld1_lane_s32((const int32_t *)row3, pred[3], 0);
  pred[3] = vld1_lane_s32((const int32_t *)(row3 + 4), pred[3], 1);

  // Dequantize.
  coeff[0] = vmulq_s16(coeff[0], dq_lo);
  coeff[1] = vmulq_s16(coeff[1], dq_hi);
  coeff[2] = vmulq_s16(coeff[2], dq_lo);
  coeff[3] = vmulq_s16(coeff[3], dq_hi);

  // Regroup so that each vector carries the same group of four coefficients
  // from both blocks (first block in the low half, second in the high half),
  // then run the first pass.
  idct_pass_2x_neon(
      vcombine_s16(vget_low_s16(coeff[0]), vget_low_s16(coeff[2])),
      vcombine_s16(vget_high_s16(coeff[0]), vget_high_s16(coeff[2])),
      vcombine_s16(vget_low_s16(coeff[1]), vget_low_s16(coeff[3])),
      vcombine_s16(vget_high_s16(coeff[1]), vget_high_s16(coeff[3])), stage);
  idct_transpose_2x_neon(stage, flipped);

  // Second pass, followed by the rounding shift (x + 4) >> 3.
  idct_pass_2x_neon(flipped[0], flipped[1], flipped[2], flipped[3], stage);
  for (k = 0; k < 4; ++k) stage[k] = vrshrq_n_s16(stage[k], 3);
  idct_transpose_2x_neon(stage, flipped);

  // Add the residual to the prediction and saturate to the 0..255 range.
  for (k = 0; k < 4; ++k) {
    const uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(flipped[k]),
                                    vreinterpret_u8_s32(pred[k]));
    pred[k] = vreinterpret_s32_u8(vqmovun_s16(vreinterpretq_s16_u16(sum)));
  }

  vst1_lane_s32((int32_t *)row0, pred[0], 0);
  vst1_lane_s32((int32_t *)(row0 + 4), pred[0], 1);
  vst1_lane_s32((int32_t *)row1, pred[1], 0);
  vst1_lane_s32((int32_t *)(row1 + 4), pred[1], 1);
  vst1_lane_s32((int32_t *)row2, pred[2], 0);
  vst1_lane_s32((int32_t *)(row2 + 4), pred[2], 1);
  vst1_lane_s32((int32_t *)row3, pred[3], 0);
  vst1_lane_s32((int32_t *)(row3 + 4), pred[3], 1);
}
231 
// Dequantizes, inverse transforms and adds a 16x16 region, handled as four
// rows of 4x4 blocks with two blocks processed per kernel call.
//
// q      coefficients, 16 per 4x4 block (64 per row of four blocks); the
//        kernels zero the coefficients they consume.
// dq     dequantization factors shared by every block.
// dst    top-left pixel of the region; used as prediction and overwritten.
// stride distance in bytes between consecutive pixel rows of dst.
// eobs   one byte per 4x4 block, four per row of blocks (presumably the
//        end-of-block position of each block -- confirm against the caller).
//        For each pair of neighbouring blocks: both bytes zero skips the
//        pair, no byte above 1 selects the DC-only kernel, anything else
//        selects the full transform.
//
// The eobs bytes are examined individually rather than through a
// (short *) cast: reading char objects through a short lvalue violates the
// strict-aliasing rule and assumes 2-byte alignment of eobs. OR-ing the two
// bytes yields the same decisions on either endianness.
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst,
                                       int stride, char *eobs) {
  int i;

  for (i = 0; i < 4; ++i) {
    int pair;

    // Blocks 0 and 1 of this row.
    pair = eobs[0] | eobs[1];
    if (pair) {
      if (pair & 0xfe) {
        idct_dequant_full_2x_neon(q, dq, dst, stride);
      } else {
        idct_dequant_0_2x_neon(q, dq[0], dst, stride);
      }
    }

    // Blocks 2 and 3 of this row, 8 pixels to the right.
    pair = eobs[2] | eobs[3];
    if (pair) {
      if (pair & 0xfe) {
        idct_dequant_full_2x_neon(q + 32, dq, dst + 8, stride);
      } else {
        idct_dequant_0_2x_neon(q + 32, dq[0], dst + 8, stride);
      }
    }

    q += 64;
    dst += 4 * stride;
    eobs += 4;
  }
}
255 
// Dequantizes, inverse transforms and adds two 8x8 regions (dst_u, then
// dst_v), each handled as two rows of two 4x4 blocks with one kernel call
// per row.
//
// q      coefficients, 16 per 4x4 block: 64 for dst_u followed by 64 for
//        dst_v; the kernels zero the coefficients they consume.
// dq     dequantization factors shared by every block.
// dst_u  top-left pixel of the first region; used as prediction and
//        overwritten.
// dst_v  top-left pixel of the second region; used as prediction and
//        overwritten.
// stride distance in bytes between consecutive pixel rows of both regions.
// eobs   one byte per 4x4 block, eight in total (presumably the end-of-block
//        position of each block -- confirm against the caller). For each
//        pair of neighbouring blocks: both bytes zero skips the pair, no
//        byte above 1 selects the DC-only kernel, anything else selects the
//        full transform.
//
// The eobs bytes are examined individually rather than through a
// (short *) cast: reading char objects through a short lvalue violates the
// strict-aliasing rule and assumes 2-byte alignment of eobs. OR-ing the two
// bytes yields the same decisions on either endianness.
void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
                                        unsigned char *dst_u,
                                        unsigned char *dst_v, int stride,
                                        char *eobs) {
  int i;

  // Pairs 0 and 1 are the upper and lower block rows of dst_u, pairs 2 and 3
  // the upper and lower block rows of dst_v.
  for (i = 0; i < 4; ++i) {
    unsigned char *const plane = (i < 2) ? dst_u : dst_v;
    unsigned char *const dst = (i & 1) ? plane + 4 * stride : plane;
    short *const coeffs = q + 32 * i;
    const int pair = eobs[2 * i] | eobs[2 * i + 1];

    if (pair) {
      if (pair & 0xfe) {
        idct_dequant_full_2x_neon(coeffs, dq, dst, stride);
      } else {
        idct_dequant_0_2x_neon(coeffs, dq[0], dst, stride);
      }
    }
  }
}
296