/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
17
highbd_iadst4_sse4_1(__m128i * const io)18 static INLINE void highbd_iadst4_sse4_1(__m128i *const io) {
19 const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0);
20 const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0);
21 const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0);
22 const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0);
23 __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2];
24 __m128i temp[2];
25
26 transpose_32bit_4x4(io, io);
27
28 extend_64bit(io[0], temp);
29 s0[0] = _mm_mul_epi32(pair_c1, temp[0]);
30 s0[1] = _mm_mul_epi32(pair_c1, temp[1]);
31 s1[0] = _mm_mul_epi32(pair_c2, temp[0]);
32 s1[1] = _mm_mul_epi32(pair_c2, temp[1]);
33
34 extend_64bit(io[1], temp);
35 s2[0] = _mm_mul_epi32(pair_c3, temp[0]);
36 s2[1] = _mm_mul_epi32(pair_c3, temp[1]);
37
38 extend_64bit(io[2], temp);
39 s3[0] = _mm_mul_epi32(pair_c4, temp[0]);
40 s3[1] = _mm_mul_epi32(pair_c4, temp[1]);
41 s4[0] = _mm_mul_epi32(pair_c1, temp[0]);
42 s4[1] = _mm_mul_epi32(pair_c1, temp[1]);
43
44 extend_64bit(io[3], temp);
45 s5[0] = _mm_mul_epi32(pair_c2, temp[0]);
46 s5[1] = _mm_mul_epi32(pair_c2, temp[1]);
47 s6[0] = _mm_mul_epi32(pair_c4, temp[0]);
48 s6[1] = _mm_mul_epi32(pair_c4, temp[1]);
49
50 t0[0] = _mm_add_epi64(s0[0], s3[0]);
51 t0[1] = _mm_add_epi64(s0[1], s3[1]);
52 t0[0] = _mm_add_epi64(t0[0], s5[0]);
53 t0[1] = _mm_add_epi64(t0[1], s5[1]);
54 t1[0] = _mm_sub_epi64(s1[0], s4[0]);
55 t1[1] = _mm_sub_epi64(s1[1], s4[1]);
56 t1[0] = _mm_sub_epi64(t1[0], s6[0]);
57 t1[1] = _mm_sub_epi64(t1[1], s6[1]);
58 temp[0] = _mm_sub_epi32(io[0], io[2]);
59 temp[0] = _mm_add_epi32(temp[0], io[3]);
60 extend_64bit(temp[0], temp);
61 t2[0] = _mm_mul_epi32(pair_c3, temp[0]);
62 t2[1] = _mm_mul_epi32(pair_c3, temp[1]);
63
64 s0[0] = _mm_add_epi64(t0[0], s2[0]);
65 s0[1] = _mm_add_epi64(t0[1], s2[1]);
66 s1[0] = _mm_add_epi64(t1[0], s2[0]);
67 s1[1] = _mm_add_epi64(t1[1], s2[1]);
68 s3[0] = _mm_add_epi64(t0[0], t1[0]);
69 s3[1] = _mm_add_epi64(t0[1], t1[1]);
70 s3[0] = _mm_sub_epi64(s3[0], s2[0]);
71 s3[1] = _mm_sub_epi64(s3[1], s2[1]);
72
73 s0[0] = dct_const_round_shift_64bit(s0[0]);
74 s0[1] = dct_const_round_shift_64bit(s0[1]);
75 s1[0] = dct_const_round_shift_64bit(s1[0]);
76 s1[1] = dct_const_round_shift_64bit(s1[1]);
77 s2[0] = dct_const_round_shift_64bit(t2[0]);
78 s2[1] = dct_const_round_shift_64bit(t2[1]);
79 s3[0] = dct_const_round_shift_64bit(s3[0]);
80 s3[1] = dct_const_round_shift_64bit(s3[1]);
81 io[0] = pack_4(s0[0], s0[1]);
82 io[1] = pack_4(s1[0], s1[1]);
83 io[2] = pack_4(s2[0], s2[1]);
84 io[3] = pack_4(s3[0], s3[1]);
85 }
86
// 4x4 inverse hybrid transform (DCT/ADST per axis, selected by tx_type),
// followed by rounding, reconstruction and store into dest. Uses the 16-bit
// SSE2 kernels when bd == 8 and the 32-bit SSE4.1 kernels otherwise.
void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                     int stride, int tx_type, int bd) {
  // First pass is a DCT for DCT_DCT/ADST_DCT; second pass is a DCT for
  // DCT_DCT/DCT_ADST. Everything else uses the ADST kernel.
  const int pass0_is_dct = (tx_type == DCT_DCT || tx_type == ADST_DCT);
  const int pass1_is_dct = (tx_type == DCT_DCT || tx_type == DCT_ADST);
  __m128i io[4];
  int i;

  // Load the 4x4 coefficient block, four 32-bit values per register.
  for (i = 0; i < 4; i++) {
    io[i] = _mm_load_si128((const __m128i *)(input + 4 * i));
  }

  if (bd == 8) {
    // 8-bit depth: coefficients fit in 16 bits, so pack and run SSE2 kernels.
    __m128i io_short[2];

    io_short[0] = _mm_packs_epi32(io[0], io[1]);
    io_short[1] = _mm_packs_epi32(io[2], io[3]);
    if (pass0_is_dct) {
      idct4_sse2(io_short);
    } else {
      iadst4_sse2(io_short);
    }
    if (pass1_is_dct) {
      idct4_sse2(io_short);
    } else {
      iadst4_sse2(io_short);
    }
    // Final rounding: (x + 8) >> 4.
    io[0] = _mm_srai_epi16(_mm_add_epi16(io_short[0], _mm_set1_epi16(8)), 4);
    io[1] = _mm_srai_epi16(_mm_add_epi16(io_short[1], _mm_set1_epi16(8)), 4);
  } else {
    // High bit depth: keep full 32-bit precision through both passes.
    if (pass0_is_dct) {
      highbd_idct4_sse4_1(io);
    } else {
      highbd_iadst4_sse4_1(io);
    }
    if (pass1_is_dct) {
      highbd_idct4_sse4_1(io);
    } else {
      highbd_iadst4_sse4_1(io);
    }
    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
  }

  recon_and_store_4x4(io, dest, stride, bd);
}
132