/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 * ih264_resi_trans_quant_sse42.c
 *
 * @brief
 * Contains function definitions for the single-stage forward transform for
 * H.264. It computes the residue, applies the forward transform and then
 * quantizes the result.
 *
 * @author
 * Mohit [100664]
 *
 * @par List of Functions:
 * - ih264_resi_trans_quant_4x4_sse42()
 * - ih264_resi_trans_quant_chroma_4x4_sse42()
 * - ih264_hadamard_quant_4x4_sse42()
 * - ih264_hadamard_quant_2x2_uv_sse42()
 *
 * @remarks
 * None
 *
 *******************************************************************************
 */
/* System include files */
#include <stddef.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_size_defs.h"
#include "ih264_macros.h"
#include "ih264_trans_macros.h"
#include "ih264_trans_data.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>
/**
 *******************************************************************************
 *
 * @brief
 * This function performs forward transform and quantization on a 4*4 block
 *
 * @par Description:
 * The function accepts a source buffer and a prediction buffer. From these,
 * it computes the residue. The residue is then transformed and quantized.
 * The transform and quantization are computed in place, using the residue
 * buffer.
 *
 * @param[in] pu1_src
 * Pointer to source sub-block
 *
 * @param[in] pu1_pred
 * Pointer to prediction sub-block
 *
 * @param[out] pi2_out
 * Pointer to the transformed and quantized output sub-block
 *
 * @param[in] src_strd
 * Source stride
 *
 * @param[in] pred_strd
 * Prediction stride
 *
 * @param[in] u4_qbits
 * QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] pu2_threshold_matrix
 * Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] pu2_scale_matrix
 * Pointer to Forward Quant Scale Matrix
 *
 * @param[in] u4_round_factor
 * Quantization Round factor
 *
 * @param[out] pu1_nnz
 * Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 * Address where the unquantized DC coefficient of the block is stored
 *
 * @returns
 *
 * @remarks
 * None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
                                      WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
                                      const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix,
                                      UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 *pu1_nnz,
                                      WORD16 *pi2_alt_dc_addr)
{
    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    WORD32 mask0, mask1;
    __m128i sum0, sum1, sum2, cmp0, cmp1;
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i temp_2 = _mm_set1_epi16(2);
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i sign_reg0, sign_reg2;
    __m128i scalemat_r0_r1, scalemat_r2_r3;

    UNUSED (pu2_threshold_matrix);

    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits

    src_r0 = _mm_cvtepu8_epi16(src_r0);
    src_r1 = _mm_cvtepu8_epi16(src_r1);
    src_r2 = _mm_cvtepu8_epi16(src_r2);
    src_r3 = _mm_cvtepu8_epi16(src_r3);

    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
    pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
    pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
    pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits

    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    src_r3 = _mm_sub_epi16(src_r3, pred_r3);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
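    /*
     * The two butterfly passes below, separated by a transpose, implement the
     * forward 4x4 core transform W = Cf . X . CfT with
     * Cf = [ 1  1  1  1 ]
     *      [ 2  1 -1 -2 ]
     *      [ 1 -1 -1  1 ]
     *      [ 1 -2  2 -1 ]
     */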
    // Matrix transpose
    /*
     * a0 a1 a2 a3
     * b0 b1 b2 b3
     * c0 c1 c2 c3
     * d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1
    temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2 */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2 */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3 */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1 */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2 */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1 */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1) */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    // Matrix transpose
    /*
     * a0 b0 c0 d0
     * a1 b1 c1 d1
     * a2 b2 c2 d2
     * a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3
    temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2 */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2 */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3 */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1 */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2 */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1 */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1) */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    tmp_dc = _mm_extract_epi16(src_r0,0); //a0
    *pi2_alt_dc_addr = tmp_dc;

    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3
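    /* Build per-coefficient sign factors: the compares give -1 for negative
       lanes and 0 otherwise; (2 * that) + 1 yields -1 for negative and +1 for
       non-negative coefficients, used to restore signs after quantization */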
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
    sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

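    /* Quantize: level = (|coeff| * scale + round_factor) >> qbits,
       with the sign re-applied after packing back to 16 bits */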
    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0,rnd_fact);
    temp1 = _mm_add_epi32(temp1,rnd_fact);
    temp2 = _mm_add_epi32(temp2,rnd_fact);
    temp3 = _mm_add_epi32(temp3,rnd_fact);

    temp0 = _mm_srli_epi32(temp0,u4_qbits);
    temp1 = _mm_srli_epi32(temp1,u4_qbits);
    temp2 = _mm_srli_epi32(temp2,u4_qbits);
    temp3 = _mm_srli_epi32(temp3,u4_qbits);

    temp0 = _mm_packs_epi32 (temp0,temp1);
    temp2 = _mm_packs_epi32 (temp2,temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

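    /* Count the zero coefficients by comparing each quantized level with zero
       and horizontally accumulating the matches */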
    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz = u4_nonzero_coeff;
}

/**
 *******************************************************************************
 *
 * @brief
 * This function performs forward transform and quantization on a 4*4 chroma
 * block
 *
 * @par Description:
 * The function accepts a source buffer and a prediction buffer. From these,
 * it computes the residue. The residue is then transformed and quantized.
 * The transform and quantization are computed in place, using the residue
 * buffer.
 *
 * @param[in] pu1_src
 * Pointer to source sub-block
 *
 * @param[in] pu1_pred
 * Pointer to prediction sub-block
 *
 * @param[out] pi2_out
 * Pointer to the transformed and quantized output sub-block
 *
 * @param[in] src_strd
 * Source stride
 *
 * @param[in] pred_strd
 * Prediction stride
 *
 * @param[in] u4_qbits
 * QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] pu2_threshold_matrix
 * Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] pu2_scale_matrix
 * Pointer to Forward Quant Scale Matrix
 *
 * @param[in] u4_round_factor
 * Quantization Round factor
 *
 * @param[out] pu1_nnz
 * Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 * Address where the unquantized DC coefficient of the block is stored
 *
 * @returns
 *
 * @remarks
 * None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD16 *pi2_out,
                                             WORD32 src_strd, WORD32 pred_strd,
                                             const UWORD16 *pu2_scale_matrix,
                                             const UWORD16 *pu2_threshold_matrix,
                                             UWORD32 u4_qbits, UWORD32 u4_round_factor,
                                             UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr)
{
    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    WORD32 mask0, mask1;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i temp_2 = _mm_set1_epi16(2);
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i sign_reg0, sign_reg2;
    __m128i scalemat_r0_r1, scalemat_r2_r3;
    __m128i chroma_mask = _mm_set1_epi16 (0xFF);

    UNUSED (pu2_threshold_matrix);

    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix)); //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0])); //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd])); //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits

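    /* The two chroma planes are interleaved in memory; masking each 16-bit
       lane with 0x00FF keeps the current plane's samples and zero-extends
       them to 16 bits, taking the place of the _mm_cvtepu8_epi16() used for
       luma */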
    src_r0 = _mm_and_si128(src_r0, chroma_mask);
    src_r1 = _mm_and_si128(src_r1, chroma_mask);
    src_r2 = _mm_and_si128(src_r2, chroma_mask);
    src_r3 = _mm_and_si128(src_r3, chroma_mask);
    // src_r0 = _mm_cvtepu8_epi16(src_r0);
    // src_r1 = _mm_cvtepu8_epi16(src_r1);
    // src_r2 = _mm_cvtepu8_epi16(src_r2);
    // src_r3 = _mm_cvtepu8_epi16(src_r3);

    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
    // pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
    // pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
    // pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
    // pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits

    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    src_r3 = _mm_sub_epi16(src_r3, pred_r3);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     * a0 a1 a2 a3
     * b0 b1 b2 b3
     * c0 c1 c2 c3
     * d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 b0 c0 d0 a1 b1 c1 d1
    temp3 = _mm_unpackhi_epi32(temp0, temp2); //a2 b2 c2 d2 a3 b3 c3 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2 */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2 */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3 */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1 */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2 */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1 */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1) */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    // Matrix transpose
    /*
     * a0 b0 c0 d0
     * a1 b1 c1 d1
     * a2 b2 c2 d2
     * a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2); //a0 a1 a2 a3 b0 b1 b2 b3
    temp3 = _mm_unpackhi_epi32(temp0, temp2); //c0 c1 c2 c3 d0 d1 d2 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3 */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2 */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2 */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3 */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1 */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2 */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1 */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1) */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    tmp_dc = _mm_extract_epi16(src_r0,0); //a0
    *pi2_alt_dc_addr = tmp_dc;

    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b,src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b,src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2,sign_reg0);
    sign_reg2 = _mm_mullo_epi16(temp_2,sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0);
    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0,rnd_fact);
    temp1 = _mm_add_epi32(temp1,rnd_fact);
    temp2 = _mm_add_epi32(temp2,rnd_fact);
    temp3 = _mm_add_epi32(temp3,rnd_fact);

    temp0 = _mm_srli_epi32(temp0,u4_qbits);
    temp1 = _mm_srli_epi32(temp1,u4_qbits);
    temp2 = _mm_srli_epi32(temp2,u4_qbits);
    temp3 = _mm_srli_epi32(temp3,u4_qbits);

    temp0 = _mm_packs_epi32 (temp0,temp1);
    temp2 = _mm_packs_epi32 (temp2,temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    //temp0 = _mm_insert_epi16(temp0, tmp_dc, 0);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz = u4_nonzero_coeff;

}


/**
 *******************************************************************************
 *
 * @brief
 * This function performs forward hadamard transform and quantization on a
 * 4*4 block of DC coefficients
 *
 * @par Description:
 * The function accepts a 4x4 block of DC coefficients, performs the forward
 * Hadamard transform on it and then quantizes the result.
 *
 * @param[in] pi2_src
 * Pointer to the 4x4 block of input DC coefficients
 *
 * @param[out] pi2_dst
 * Pointer to the transformed and quantized output block
 *
 * @param[in] pu2_scale_matrix
 * Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 * Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 * QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 * Quantization Round factor
 *
 * @param[out] pu1_nnz
 * Total non-zero coefficients in the current sub-block
 *
 * @returns
 *
 * @remarks
 * None
 *
 */

void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                    const UWORD16 *pu2_scale_matrix,
                                    const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                                    UWORD32 u4_round_factor, UWORD8 *pu1_nnz)
{
    WORD32 u4_zero_coeff,u4_nonzero_coeff=0;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    WORD32 mask0, mask1;
    __m128i src_r0_r1, src_r2_r3, sign_reg;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i temp0, temp1, temp2, temp3;
    __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);

    UNUSED (pu2_threshold_matrix);

    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); //b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); //d0 d1 d2 d3

    /* Perform forward DC (Hadamard) transform */
    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     * a0 a1 a2 a3
     * b0 b1 b2 b3
     * c0 c1 c2 c3
     * d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp0, temp2); //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp1, temp3); //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp1, temp3); //a3 b3 c3 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Vertical transformation ] */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     * a0 b0 c0 d0
     * a1 b1 c1 d1
     * a2 b2 c2 d2
     * a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp0, temp2); //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp1, temp3); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp1, temp3); //d0 d1 d2 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

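    /* Halve the Hadamard output before quantization; the luma DC path
       quantizes (coeff >> 1) rather than the full-range coefficient */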
    src_r0 = _mm_srai_epi32(src_r0, 1);
    src_r1 = _mm_srai_epi32(src_r1, 1);
    src_r2 = _mm_srai_epi32(src_r2, 1);
    src_r3 = _mm_srai_epi32(src_r3, 1);

    // Quantization
    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0); //Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
    sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
    sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);

    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);

    sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_slli_epi16(sign_reg2, 1);

    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively
    sign_reg2 = _mm_add_epi16(temp_1,sign_reg2);

    src_r0 = _mm_abs_epi32(src_r0); //Absolute values
    src_r1 = _mm_abs_epi32(src_r1);
    src_r2 = _mm_abs_epi32(src_r2);
    src_r3 = _mm_abs_epi32(src_r3);

    temp0 = _mm_mullo_epi32(scale_val, src_r0); //multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, src_r1);
    temp2 = _mm_mullo_epi32(scale_val, src_r2);
    temp3 = _mm_mullo_epi32(scale_val, src_r3);

    temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor
    temp1 = _mm_add_epi32(temp1,rnd_fact);
    temp2 = _mm_add_epi32(temp2,rnd_fact);
    temp3 = _mm_add_epi32(temp3,rnd_fact);

    temp0 = _mm_srli_epi32(temp0,u4_qbits); //Right shift by qbits, unsigned variable, so shift right immediate works
    temp1 = _mm_srli_epi32(temp1,u4_qbits);
    temp2 = _mm_srli_epi32(temp2,u4_qbits);
    temp3 = _mm_srli_epi32(temp3,u4_qbits);

    temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only.
    temp2 = _mm_packs_epi32 (temp2,temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff+=8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    pu1_nnz[0] = u4_nonzero_coeff;
}


/**
 *******************************************************************************
 *
 * @brief
 * This function performs forward hadamard transform and quantization on a
 * 2*2 block of DC coefficients for both U and V planes
 *
 * @par Description:
 * The function accepts the 2x2 chroma DC coefficients of the two chroma
 * planes, performs the forward Hadamard transform on each plane and then
 * quantizes the results.
 *
 * @param[in] pi2_src
 * Pointer to the input DC coefficients (4 values per plane)
 *
 * @param[out] pi2_dst
 * Pointer to the transformed and quantized output coefficients
 *
 * @param[in] pu2_scale_matrix
 * Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 * Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 * QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 * Quantization Round factor
 *
 * @param[out] pu1_nnz
 * Total non-zero coefficients, one count per plane
 *
 * @returns
 *
 * @remarks
 * NNZ for dc is populated at 0 and 5th position of pu1_nnz
 *
 */

void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                       const UWORD16 *pu2_scale_matrix,
                                       const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                                       UWORD32 u4_round_factor, UWORD8 *pu1_nnz)
{
    WORD32 val, nonzero_coeff_0=0, nonzero_coeff_1=0;
    __m128i cmp, cmp0, cmp1;
    __m128i sum0, sum1;
    WORD32 mask, mask0, mask1;
    __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
    __m128i sign_reg0, sign_reg1;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);

    UNUSED (pu2_threshold_matrix);

    src = _mm_loadu_si128((__m128i *)pi2_src); //a0 a1 a2 a3 b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
    plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits
    plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits

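    /* 2x2 Hadamard on the four DC terms of each chroma plane; both planes are
       transformed in parallel using horizontal add/subtract pairs */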
    temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3
    temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3

    plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
    plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3

    temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
    temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
    plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
    plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
    // Quantization
    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0); //Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);

    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg0 = _mm_add_epi16(temp_1,sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively

    plane_0 = _mm_abs_epi32(plane_0); //Absolute values
    plane_1 = _mm_abs_epi32(plane_1);

    temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_scale_matrix[0]

    temp0 = _mm_add_epi32(temp0,rnd_fact); //Add round factor
    temp1 = _mm_add_epi32(temp1,rnd_fact);

    temp0 = _mm_srli_epi32(temp0,u4_qbits); //Right shift by qbits, unsigned variable, so shift right immediate works
    temp1 = _mm_srli_epi32(temp1,u4_qbits);

    temp0 = _mm_packs_epi32 (temp0,temp1); //Final values are 16-bits only.
    temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);

    cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
    mask = _mm_movemask_epi8(cmp);
    mask0 = mask & 0xff;
    mask1 = mask>>8;
    if(mask0)
    {
        if(mask0 == 0xff)
            nonzero_coeff_0 += 4;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            val = _mm_cvtsi128_si32(sum1);
            val = val & 0xffff;
            nonzero_coeff_0 += val;
        }
    }
    if(mask1)
    {
        if(mask1 == 0xff)
            nonzero_coeff_1 += 4;
        else
        {
            cmp1 = _mm_srli_si128(cmp, 8);
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            nonzero_coeff_1 += _mm_cvtsi128_si32(sum1);
        }
    }

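    /* nonzero_coeff_0/1 actually hold the number of zero coefficients found in
       each plane; subtracting from 4 gives the non-zero count */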
    pu1_nnz[0] = 4 - nonzero_coeff_0;
    pu1_nnz[1] = 4 - nonzero_coeff_1;

}
