1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx_dsp/arm/idct_neon.h"
15 #include "vpx_dsp/inv_txfm.h"
16
idct16x16_1_add_pos_kernel(uint8_t ** dest,const int stride,const uint8x16_t res)17 static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride,
18 const uint8x16_t res) {
19 const uint8x16_t a = vld1q_u8(*dest);
20 const uint8x16_t b = vqaddq_u8(a, res);
21 vst1q_u8(*dest, b);
22 *dest += stride;
23 }
24
idct16x16_1_add_neg_kernel(uint8_t ** dest,const int stride,const uint8x16_t res)25 static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride,
26 const uint8x16_t res) {
27 const uint8x16_t a = vld1q_u8(*dest);
28 const uint8x16_t b = vqsubq_u8(a, res);
29 vst1q_u8(*dest, b);
30 *dest += stride;
31 }
32
vpx_idct16x16_1_add_neon(const tran_low_t * input,uint8_t * dest,int stride)33 void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest,
34 int stride) {
35 const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
36 const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
37 const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
38
39 if (a1 >= 0) {
40 const uint8x16_t dc = create_dcq(a1);
41 idct16x16_1_add_pos_kernel(&dest, stride, dc);
42 idct16x16_1_add_pos_kernel(&dest, stride, dc);
43 idct16x16_1_add_pos_kernel(&dest, stride, dc);
44 idct16x16_1_add_pos_kernel(&dest, stride, dc);
45 idct16x16_1_add_pos_kernel(&dest, stride, dc);
46 idct16x16_1_add_pos_kernel(&dest, stride, dc);
47 idct16x16_1_add_pos_kernel(&dest, stride, dc);
48 idct16x16_1_add_pos_kernel(&dest, stride, dc);
49 idct16x16_1_add_pos_kernel(&dest, stride, dc);
50 idct16x16_1_add_pos_kernel(&dest, stride, dc);
51 idct16x16_1_add_pos_kernel(&dest, stride, dc);
52 idct16x16_1_add_pos_kernel(&dest, stride, dc);
53 idct16x16_1_add_pos_kernel(&dest, stride, dc);
54 idct16x16_1_add_pos_kernel(&dest, stride, dc);
55 idct16x16_1_add_pos_kernel(&dest, stride, dc);
56 idct16x16_1_add_pos_kernel(&dest, stride, dc);
57 } else {
58 const uint8x16_t dc = create_dcq(-a1);
59 idct16x16_1_add_neg_kernel(&dest, stride, dc);
60 idct16x16_1_add_neg_kernel(&dest, stride, dc);
61 idct16x16_1_add_neg_kernel(&dest, stride, dc);
62 idct16x16_1_add_neg_kernel(&dest, stride, dc);
63 idct16x16_1_add_neg_kernel(&dest, stride, dc);
64 idct16x16_1_add_neg_kernel(&dest, stride, dc);
65 idct16x16_1_add_neg_kernel(&dest, stride, dc);
66 idct16x16_1_add_neg_kernel(&dest, stride, dc);
67 idct16x16_1_add_neg_kernel(&dest, stride, dc);
68 idct16x16_1_add_neg_kernel(&dest, stride, dc);
69 idct16x16_1_add_neg_kernel(&dest, stride, dc);
70 idct16x16_1_add_neg_kernel(&dest, stride, dc);
71 idct16x16_1_add_neg_kernel(&dest, stride, dc);
72 idct16x16_1_add_neg_kernel(&dest, stride, dc);
73 idct16x16_1_add_neg_kernel(&dest, stride, dc);
74 idct16x16_1_add_neg_kernel(&dest, stride, dc);
75 }
76 }
77