1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vp9_rtcd.h"
12 #include "vpx_dsp/x86/inv_txfm_sse2.h"
13 
vp9_iht4x4_16_add_sse2(const tran_low_t * input,uint8_t * dest,int stride,int tx_type)14 void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
15                             int tx_type) {
16   __m128i in[2];
17   const __m128i eight = _mm_set1_epi16(8);
18 
19   in[0] = load_input_data8(input);
20   in[1] = load_input_data8(input + 8);
21 
22   switch (tx_type) {
23     case DCT_DCT:
24       idct4_sse2(in);
25       idct4_sse2(in);
26       break;
27     case ADST_DCT:
28       idct4_sse2(in);
29       iadst4_sse2(in);
30       break;
31     case DCT_ADST:
32       iadst4_sse2(in);
33       idct4_sse2(in);
34       break;
35     default:
36       assert(tx_type == ADST_ADST);
37       iadst4_sse2(in);
38       iadst4_sse2(in);
39       break;
40   }
41 
42   // Final round and shift
43   in[0] = _mm_add_epi16(in[0], eight);
44   in[1] = _mm_add_epi16(in[1], eight);
45 
46   in[0] = _mm_srai_epi16(in[0], 4);
47   in[1] = _mm_srai_epi16(in[1], 4);
48 
49   recon_and_store4x4_sse2(in, dest, stride);
50 }
51 
vp9_iht8x8_64_add_sse2(const tran_low_t * input,uint8_t * dest,int stride,int tx_type)52 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
53                             int tx_type) {
54   __m128i in[8];
55   const __m128i final_rounding = _mm_set1_epi16(1 << 4);
56 
57   // load input data
58   in[0] = load_input_data8(input);
59   in[1] = load_input_data8(input + 8 * 1);
60   in[2] = load_input_data8(input + 8 * 2);
61   in[3] = load_input_data8(input + 8 * 3);
62   in[4] = load_input_data8(input + 8 * 4);
63   in[5] = load_input_data8(input + 8 * 5);
64   in[6] = load_input_data8(input + 8 * 6);
65   in[7] = load_input_data8(input + 8 * 7);
66 
67   switch (tx_type) {
68     case DCT_DCT:
69       vpx_idct8_sse2(in);
70       vpx_idct8_sse2(in);
71       break;
72     case ADST_DCT:
73       vpx_idct8_sse2(in);
74       iadst8_sse2(in);
75       break;
76     case DCT_ADST:
77       iadst8_sse2(in);
78       vpx_idct8_sse2(in);
79       break;
80     default:
81       assert(tx_type == ADST_ADST);
82       iadst8_sse2(in);
83       iadst8_sse2(in);
84       break;
85   }
86 
87   // Final rounding and shift
88   in[0] = _mm_adds_epi16(in[0], final_rounding);
89   in[1] = _mm_adds_epi16(in[1], final_rounding);
90   in[2] = _mm_adds_epi16(in[2], final_rounding);
91   in[3] = _mm_adds_epi16(in[3], final_rounding);
92   in[4] = _mm_adds_epi16(in[4], final_rounding);
93   in[5] = _mm_adds_epi16(in[5], final_rounding);
94   in[6] = _mm_adds_epi16(in[6], final_rounding);
95   in[7] = _mm_adds_epi16(in[7], final_rounding);
96 
97   in[0] = _mm_srai_epi16(in[0], 5);
98   in[1] = _mm_srai_epi16(in[1], 5);
99   in[2] = _mm_srai_epi16(in[2], 5);
100   in[3] = _mm_srai_epi16(in[3], 5);
101   in[4] = _mm_srai_epi16(in[4], 5);
102   in[5] = _mm_srai_epi16(in[5], 5);
103   in[6] = _mm_srai_epi16(in[6], 5);
104   in[7] = _mm_srai_epi16(in[7], 5);
105 
106   recon_and_store(dest + 0 * stride, in[0]);
107   recon_and_store(dest + 1 * stride, in[1]);
108   recon_and_store(dest + 2 * stride, in[2]);
109   recon_and_store(dest + 3 * stride, in[3]);
110   recon_and_store(dest + 4 * stride, in[4]);
111   recon_and_store(dest + 5 * stride, in[5]);
112   recon_and_store(dest + 6 * stride, in[6]);
113   recon_and_store(dest + 7 * stride, in[7]);
114 }
115 
load_buffer_8x16(const tran_low_t * const input,__m128i * const in)116 static INLINE void load_buffer_8x16(const tran_low_t *const input,
117                                     __m128i *const in) {
118   in[0] = load_input_data8(input + 0 * 16);
119   in[1] = load_input_data8(input + 1 * 16);
120   in[2] = load_input_data8(input + 2 * 16);
121   in[3] = load_input_data8(input + 3 * 16);
122   in[4] = load_input_data8(input + 4 * 16);
123   in[5] = load_input_data8(input + 5 * 16);
124   in[6] = load_input_data8(input + 6 * 16);
125   in[7] = load_input_data8(input + 7 * 16);
126 
127   in[8] = load_input_data8(input + 8 * 16);
128   in[9] = load_input_data8(input + 9 * 16);
129   in[10] = load_input_data8(input + 10 * 16);
130   in[11] = load_input_data8(input + 11 * 16);
131   in[12] = load_input_data8(input + 12 * 16);
132   in[13] = load_input_data8(input + 13 * 16);
133   in[14] = load_input_data8(input + 14 * 16);
134   in[15] = load_input_data8(input + 15 * 16);
135 }
136 
write_buffer_8x16(uint8_t * const dest,__m128i * const in,const int stride)137 static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in,
138                                      const int stride) {
139   const __m128i final_rounding = _mm_set1_epi16(1 << 5);
140   // Final rounding and shift
141   in[0] = _mm_adds_epi16(in[0], final_rounding);
142   in[1] = _mm_adds_epi16(in[1], final_rounding);
143   in[2] = _mm_adds_epi16(in[2], final_rounding);
144   in[3] = _mm_adds_epi16(in[3], final_rounding);
145   in[4] = _mm_adds_epi16(in[4], final_rounding);
146   in[5] = _mm_adds_epi16(in[5], final_rounding);
147   in[6] = _mm_adds_epi16(in[6], final_rounding);
148   in[7] = _mm_adds_epi16(in[7], final_rounding);
149   in[8] = _mm_adds_epi16(in[8], final_rounding);
150   in[9] = _mm_adds_epi16(in[9], final_rounding);
151   in[10] = _mm_adds_epi16(in[10], final_rounding);
152   in[11] = _mm_adds_epi16(in[11], final_rounding);
153   in[12] = _mm_adds_epi16(in[12], final_rounding);
154   in[13] = _mm_adds_epi16(in[13], final_rounding);
155   in[14] = _mm_adds_epi16(in[14], final_rounding);
156   in[15] = _mm_adds_epi16(in[15], final_rounding);
157 
158   in[0] = _mm_srai_epi16(in[0], 6);
159   in[1] = _mm_srai_epi16(in[1], 6);
160   in[2] = _mm_srai_epi16(in[2], 6);
161   in[3] = _mm_srai_epi16(in[3], 6);
162   in[4] = _mm_srai_epi16(in[4], 6);
163   in[5] = _mm_srai_epi16(in[5], 6);
164   in[6] = _mm_srai_epi16(in[6], 6);
165   in[7] = _mm_srai_epi16(in[7], 6);
166   in[8] = _mm_srai_epi16(in[8], 6);
167   in[9] = _mm_srai_epi16(in[9], 6);
168   in[10] = _mm_srai_epi16(in[10], 6);
169   in[11] = _mm_srai_epi16(in[11], 6);
170   in[12] = _mm_srai_epi16(in[12], 6);
171   in[13] = _mm_srai_epi16(in[13], 6);
172   in[14] = _mm_srai_epi16(in[14], 6);
173   in[15] = _mm_srai_epi16(in[15], 6);
174 
175   recon_and_store(dest + 0 * stride, in[0]);
176   recon_and_store(dest + 1 * stride, in[1]);
177   recon_and_store(dest + 2 * stride, in[2]);
178   recon_and_store(dest + 3 * stride, in[3]);
179   recon_and_store(dest + 4 * stride, in[4]);
180   recon_and_store(dest + 5 * stride, in[5]);
181   recon_and_store(dest + 6 * stride, in[6]);
182   recon_and_store(dest + 7 * stride, in[7]);
183   recon_and_store(dest + 8 * stride, in[8]);
184   recon_and_store(dest + 9 * stride, in[9]);
185   recon_and_store(dest + 10 * stride, in[10]);
186   recon_and_store(dest + 11 * stride, in[11]);
187   recon_and_store(dest + 12 * stride, in[12]);
188   recon_and_store(dest + 13 * stride, in[13]);
189   recon_and_store(dest + 14 * stride, in[14]);
190   recon_and_store(dest + 15 * stride, in[15]);
191 }
192 
vp9_iht16x16_256_add_sse2(const tran_low_t * input,uint8_t * dest,int stride,int tx_type)193 void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
194                                int stride, int tx_type) {
195   __m128i in0[16], in1[16];
196 
197   load_buffer_8x16(input, in0);
198   input += 8;
199   load_buffer_8x16(input, in1);
200 
201   switch (tx_type) {
202     case DCT_DCT:
203       idct16_sse2(in0, in1);
204       idct16_sse2(in0, in1);
205       break;
206     case ADST_DCT:
207       idct16_sse2(in0, in1);
208       iadst16_sse2(in0, in1);
209       break;
210     case DCT_ADST:
211       iadst16_sse2(in0, in1);
212       idct16_sse2(in0, in1);
213       break;
214     default:
215       assert(tx_type == ADST_ADST);
216       iadst16_sse2(in0, in1);
217       iadst16_sse2(in0, in1);
218       break;
219   }
220 
221   write_buffer_8x16(dest, in0, stride);
222   dest += 8;
223   write_buffer_8x16(dest, in1, stride);
224 }
225