1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_iquant_itrans_recon_atom_intr.c
22 *
23 * @brief
24 * Contains function definitions for inverse quantization, inverse
25 * transform and reconstruction
26 *
27 * @author
28 * 100470
29 * 100592 (edited by)
30 *
31 * @par List of Functions:
32 * - ihevc_itrans_recon_16x16_ssse3()
33 *
34 * @remarks
35 * None
36 *
37 *******************************************************************************
38 */
39 #include <stdio.h>
40 #include <string.h>
41 #include "ihevc_typedefs.h"
42 #include "ihevc_macros.h"
43 #include "ihevc_platform_macros.h"
44 #include "ihevc_defs.h"
45 #include "ihevc_trans_tables.h"
46 #include "ihevc_itrans_recon.h"
47 #include "ihevc_func_selector.h"
48 #include "ihevc_trans_macros.h"
49
50
51
52 #include <immintrin.h>
53 #include <emmintrin.h>
54
55 #include <tmmintrin.h>
56
57
58 /**
59 *******************************************************************************
60 *
61 * @brief
62 * This function performs inverse transform and reconstruction for a
63 * 16x16 input block
64 *
65 * @par Description:
66 * Performs inverse transform, adds the prediction data and clips the
67 * output to 8 bit
68 *
69 * @param[in] pi2_src
70 * Input 16x16 coefficients
71 *
72 * @param[in] pi2_tmp
73 * Temporary 16x16 buffer for storing inverse
74 * transform 1st stage output
75 *
76 * @param[in] pu1_pred
77 * Prediction 16x16 block
78 *
79 * @param[in] pi2_dequant_coeff
80 * Dequant Coeffs
81 *
82 * @param[out] pu1_dst
83 * Output 16x16 block
84 *
85 * @param[in] qp_div
86 * Quantization parameter / 6
87 *
88 * @param[in] qp_rem
89 * Quantization parameter % 6
90 *
91 * @param[in] src_strd
92 * Input stride
93 *
94 * @param[in] pred_strd
95 * Prediction stride
96 *
97 * @param[in] dst_strd
98 * Output Stride
99 *
100 * @param[in] zero_cols,zero_rows
101 * Bit flags marking the zero columns and zero rows in pi2_src
102 *
103 * @returns Void
104 *
105 * @remarks
106 * None
107 *
108 *******************************************************************************
109 */
110
ihevc_itrans_recon_16x16_ssse3(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)111 void ihevc_itrans_recon_16x16_ssse3(WORD16 *pi2_src,
112 WORD16 *pi2_tmp,
113 UWORD8 *pu1_pred,
114 UWORD8 *pu1_dst,
115 WORD32 src_strd,
116 WORD32 pred_strd,
117 WORD32 dst_strd,
118 WORD32 zero_cols,
119 WORD32 zero_rows)
120 {
121 __m128i m_temp_reg_0;
122 __m128i m_temp_reg_1;
123 __m128i m_temp_reg_10;
124 __m128i m_temp_reg_11;
125 __m128i m_temp_reg_12;
126 __m128i m_temp_reg_13;
127 __m128i m_temp_reg_14;
128
129 __m128i m_temp_reg_20;
130 __m128i m_temp_reg_21;
131 __m128i m_temp_reg_22;
132 __m128i m_temp_reg_23;
133 __m128i m_temp_reg_24;
134 __m128i m_temp_reg_25;
135 __m128i m_temp_reg_26;
136 __m128i m_temp_reg_27;
137 __m128i m_temp_reg_30;
138 __m128i m_temp_reg_31;
139 __m128i m_temp_reg_32;
140 __m128i m_temp_reg_33;
141 __m128i m_temp_reg_34;
142 __m128i m_temp_reg_35;
143 __m128i m_temp_reg_36;
144 __m128i m_temp_reg_37;
145 __m128i m_temp_reg_40;
146 __m128i m_temp_reg_41;
147 __m128i m_temp_reg_42;
148 __m128i m_temp_reg_43;
149 __m128i m_temp_reg_44;
150 __m128i m_temp_reg_45;
151 __m128i m_temp_reg_46;
152 __m128i m_temp_reg_47;
153
154 __m128i m_temp_reg_70;
155 __m128i m_temp_reg_71;
156 __m128i m_temp_reg_72;
157 __m128i m_temp_reg_73;
158 __m128i m_temp_reg_74;
159 __m128i m_temp_reg_75;
160 __m128i m_temp_reg_76;
161 __m128i m_temp_reg_77;
162 __m128i m_rdng_factor;
163 __m128i m_count;
164 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
165 __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
166
167 WORD32 i;
168 /*Lokesh*/
169 WORD32 zero_last8_cols_stg1;
170 WORD32 zero_last8_rows_stg1;
171 WORD32 zero_last12_rows_stg1;
172 WORD32 zero_last12_rows_stg2;
173 WORD32 zero_last8_rows_stg2;
174
175 WORD32 loop = 0;
176
177 WORD32 i4_shift = IT_SHIFT_STAGE_1;
178 WORD32 trans_size = TRANS_SIZE_16;
179
180
181
182
183 /* Following 3 instructions replicates the value in the */
184 /* lower 16 bits of m_add_iq in the entire register */
185
186 /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
187
188 zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
189 zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
190 zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
191
192 zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
193 zero_last8_rows_stg2 = zero_last8_cols_stg1;
194 if(zero_last8_cols_stg1)
195 {
196 loop = 1;
197 }
198 else
199 loop = 2;
200
201 /* i = 0 => lower 8 samples */
202 /* i = 1 => higher 8 samples */
203 for(i = 0; i < loop; i++)
204 {
205 {
206 WORD32 sample_half_index = i << 3;
207 WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
208 WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
209
210 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
211 pi2_tmp_src += (src_strd << 1);
212 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
213 pi2_tmp_src += (src_strd << 1);
214 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
215 pi2_tmp_src += (src_strd << 1);
216 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
217 pi2_tmp_src += (src_strd << 1);
218 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
219 pi2_tmp_src += (src_strd << 1);
220 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
221 pi2_tmp_src += (src_strd << 1);
222 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
223 pi2_tmp_src += (src_strd << 1);
224 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
225 pi2_tmp_src += (src_strd << 1);
226
227
228
229
230 /* If last 12 rows are zero : Rishab */
231 if(zero_last12_rows_stg1)
232 {
233
234 /* eee */
235 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
236 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
237 {
238 /* Loading coeff and src for use in next block */
239
240 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
241 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
242
243 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
244
245 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
246
247 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
248
249 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
250
251 m_temp_reg_26 = m_temp_reg_24;
252 m_temp_reg_27 = m_temp_reg_25;
253 }
254
255 /* eo */
256
257 /* eo0[0-3] */
258 {
259 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
260 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
261
262 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
263
264 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
265
266 /* e[0][0-3] stored in pi2_tmp[0][0-7] */
267 /* e[7][0-3] stored in pi2_tmp[0][8-15] */
268 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
269 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
270
271 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
272 pi2_scratch += 8;
273 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
274 pi2_scratch += 8;
275
276 }
277
278
279 /* eo0[4-7] */
280 {
281 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
282
283 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
284
285 /* e[0][4-7] stored in pi2_tmp[1][0-7] */
286 /* e[7][4-7] stored in pi2_tmp[1][8-15] */
287 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
288 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
289
290 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
291 pi2_scratch += 8;
292 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
293 pi2_scratch += 8;
294
295 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
296 }
297
298 /* eo1[0-3] */
299 {
300 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
301
302 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
303
304 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
305 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
306 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
307 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
308
309 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
310 pi2_scratch += 8;
311 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
312 pi2_scratch += 8;
313 }
314
315 /* eo1[4-7] */
316 {
317 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
318
319 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
320
321 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
322 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
323 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
324 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
325
326 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
327 pi2_scratch += 8;
328 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
329 pi2_scratch += 8;
330
331 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
332
333 }
334
335 /* eo2[0-3] */
336 {
337 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
338
339 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
340 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
341 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
342 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
343
344 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
345 pi2_scratch += 8;
346 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
347 pi2_scratch += 8;
348
349 }
350
351 /* eo2[4-7] */
352 {
353 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
354
355 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
356 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
357 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
358 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
359
360
361 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
362 pi2_scratch += 8;
363 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
364 pi2_scratch += 8;
365
366 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
367 }
368
369 /* eo3[0-3] */
370 {
371 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
372
373 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
374 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
375 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
376 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
377
378 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
379 pi2_scratch += 8;
380 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
381 pi2_scratch += 8;
382 }
383
384 /* eo3[4-7] */
385 {
386 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
387
388 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
389 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
390 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
391 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
392
393 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
394 pi2_scratch += 8;
395 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
396 pi2_scratch += 8;
397 }
398 }
399 /* If last 8 rows are zero : Rishab */
400 else if(zero_last8_rows_stg1)
401 {
402 /* eeo */
403 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
404 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
405 {
406 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
407 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
408
409 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
410 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
411
412 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
413 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
414
415 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
416 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
417
418 }
419
420 /* eee */
421 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
422 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
423 {
424 /* Loading coeff and src for use in next block */
425 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get signs
426 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
427
428 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
429
430 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
431
432 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
433
434 m_temp_reg_26 = m_temp_reg_24;
435 m_temp_reg_27 = m_temp_reg_25;
436
437 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
438 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
439 }
440
441 /* eo0[0-3] */
442 {
443 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
444 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
445
446 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
447
448 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
449 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
450 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
451
452 /* e[0][0-3] stored in pi2_tmp[0][0-7] */
453 /* e[7][0-3] stored in pi2_tmp[0][8-15] */
454 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
455 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
456
457 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
458 pi2_scratch += 8;
459 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
460 pi2_scratch += 8;
461
462 }
463
464 /* eo0[4-7] */
465 {
466 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
467
468 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
469 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
470 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
471
472 /* e[0][4-7] stored in pi2_tmp[1][0-7] */
473 /* e[7][4-7] stored in pi2_tmp[1][8-15] */
474 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
475 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
476
477 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
478 pi2_scratch += 8;
479 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
480 pi2_scratch += 8;
481
482 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
483 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
484
485 }
486
487 /* eo1[0-3] */
488 {
489 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
490
491 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
492 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
493 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
494
495 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
496 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
497 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
498 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
499
500 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
501 pi2_scratch += 8;
502 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
503 pi2_scratch += 8;
504
505 }
506
507 /* eo1[4-7] */
508 {
509 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
510
511 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
512 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
513 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
514
515 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
516 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
517 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
518 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
519
520 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
521 pi2_scratch += 8;
522 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
523 pi2_scratch += 8;
524
525 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
526 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
527
528 }
529
530 /* eo2[0-3] */
531 {
532 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
533
534 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
535 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
536 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
537 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
538
539 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
540 pi2_scratch += 8;
541 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
542 pi2_scratch += 8;
543
544 }
545
546 /* eo2[4-7] */
547 {
548 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
549
550 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
551 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
552 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
553 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
554
555 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
556 pi2_scratch += 8;
557 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
558 pi2_scratch += 8;
559
560 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
561 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
562
563 }
564
565 /* eo3[0-3] */
566 {
567 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
568
569 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
570 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
571 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
572 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
573
574 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
575 pi2_scratch += 8;
576 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
577 pi2_scratch += 8;
578 }
579
580 /* eo3[4-7] */
581 {
582 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
583
584 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
585 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
586 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
587 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
588
589 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
590 pi2_scratch += 8;
591 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
592 pi2_scratch += 8;
593 }
594 } /* If all the rows are non-zero : Rishab */
595 else
596 {
597 /* eeo */
598 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
599 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
600
601 {
602 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
603 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
604
605 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
606 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
607
608 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
609 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
610
611 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
612 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
613 }
614
615 /* eee */
616 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
617 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
618 {
619 /* Loading coeff and src for use in next block */
620 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64
621 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
622
623 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
624 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
625
626 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
627 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
628
629 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
630 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
631
632 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
633 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
634
635 }
636 /* eo0[0-3] */
637 {
638 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
639 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
640 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
641 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
642
643 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
644 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
645
646
647 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
648 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
649 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
650
651 /* e[0][0-3] stored in pi2_tmp[0][0-7] */
652 /* e[7][0-3] stored in pi2_tmp[0][8-15] */
653 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
654 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
655 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
656 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
657
658 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
659 pi2_scratch += 8;
660 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
661 pi2_scratch += 8;
662
663
664 }
665
666 /* eo0[4-7] */
667 {
668 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
669 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
670
671 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
672 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
673 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
674
675 /* e[0][4-7] stored in pi2_tmp[1][0-7] */
676 /* e[7][4-7] stored in pi2_tmp[1][8-15] */
677 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
678 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
679 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
680 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
681
682 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
683 pi2_scratch += 8;
684 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
685 pi2_scratch += 8;
686
687 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
688 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
689
690 }
691
692 /* eo1[0-3] */
693 {
694 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
695 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
696
697 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
698 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
699 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
700
701 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
702 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
703 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
704 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
705 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
706 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
707
708 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
709 pi2_scratch += 8;
710 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
711 pi2_scratch += 8;
712
713 }
714
715 /* eo1[4-7] */
716 {
717 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
718 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
719
720 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
721 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
722 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
723
724 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
725 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
726 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
727 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
728 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
729 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
730
731 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
732 pi2_scratch += 8;
733 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
734 pi2_scratch += 8;
735 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
736 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
737 }
738
739 /* eo2[0-3] */
740 {
741 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
742 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
743
744 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
745 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
746 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
747 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
748 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
749 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
750
751 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
752 pi2_scratch += 8;
753 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
754 pi2_scratch += 8;
755 }
756
757 /* eo2[4-7] */
758 {
759 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
760 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
761
762 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
763 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
764 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
765 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
766 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
767 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
768
769 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
770 pi2_scratch += 8;
771 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
772 pi2_scratch += 8;
773
774 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
775 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
776
777 }
778
779 /* eo3[0-3] */
780 {
781 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
782 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
783
784 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
785 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
786 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
787 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
788 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
789 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
790
791
792 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
793 pi2_scratch += 8;
794 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
795 pi2_scratch += 8;
796 }
797
798 /* eo3[4-7] */
799 {
800 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
801 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
802
803 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
804 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
805 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
806 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
807 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
808 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
809
810 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
811 pi2_scratch += 8;
812 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
813 pi2_scratch += 8;
814 }
815
816 }
817 }
818
819 {
820 WORD32 sample_half_index = i << 3;
821 WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
822
823 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
824 pi2_tmp_src += (src_strd << 1);
825 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
826 pi2_tmp_src += (src_strd << 1);
827 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
828 pi2_tmp_src += (src_strd << 1);
829 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
830 pi2_tmp_src += (src_strd << 1);
831 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
832 pi2_tmp_src += (src_strd << 1);
833 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
834 pi2_tmp_src += (src_strd << 1);
835 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
836 pi2_tmp_src += (src_strd << 1);
837 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
838 pi2_tmp_src += (src_strd << 1);
839 }
840
841 /* o & stage 1 out */
842 {
843 WORD32 j;
844 WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
845 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
846 WORD32 out_stride = (trans_size << 1);
847 WORD32 in_stride = trans_size << 1;
848
849 if(zero_last12_rows_stg1)
850 {
851 for(j = 0; j < 2; j++)
852 {
853 if(j) //H8B= higher 8 bytes L8B lower 8 bytes
854 {
855 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
856 }
857 else
858 {
859 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
860 }
861 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
862
863
864 /* o0[0-3] */
865 {
866 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
867
868
869 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
870 pi2_src_scratch += in_stride;
871
872 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
873
874
875 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
876 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
877
878 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
879 m_count = _mm_cvtsi32_si128(i4_shift);
880 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
881
882 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
883 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
884 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
885 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
886
887 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
888
889 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
890 pi2_dst_scratch += out_stride;
891 }
892
893 /* o1[0-3] */
894 {
895 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
896
897
898 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
899 pi2_src_scratch += in_stride;
900
901 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
902
903 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
904 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
905
906 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
907 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
908 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
909 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
910
911 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
912
913 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
914 pi2_dst_scratch += out_stride;
915 }
916
917 /* o2[0-3] */
918 {
919 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
920
921 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
922 pi2_src_scratch += in_stride;
923
924 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
925
926 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
927 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
928
929 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
930 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
931 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
932 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
933
934 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
935
936 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
937 pi2_dst_scratch += out_stride;
938 }
939
940 /* o3[0-3] */
941 {
942 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
943
944 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
945 pi2_src_scratch += 8;
946
947 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
948
949 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
950 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
951
952 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
953 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
954 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
955 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
956
957 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
958
959 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
960 pi2_dst_scratch += 8;
961 }
962
963 /* o4[0-3] */
964 {
965 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
966
967 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
968 pi2_src_scratch -= in_stride;
969
970 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
971
972 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
973 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
974
975 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
976 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
977 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
978 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
979
980 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
981
982 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
983 pi2_dst_scratch -= out_stride;
984 }
985
986 /* o5[0-3] */
987 {
988 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
989
990
991 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
992 pi2_src_scratch -= in_stride;
993
994 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
995
996 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
997 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
998
999 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1000 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1001 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1002 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1003
1004 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1005
1006 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1007 pi2_dst_scratch -= out_stride;
1008 }
1009
1010 /* o6[0-3] */
1011 {
1012 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1013
1014 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1015 pi2_src_scratch -= in_stride;
1016
1017 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1018
1019 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1020 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1021
1022 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1023 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1024 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1025 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1026
1027 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1028
1029 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1030 pi2_dst_scratch -= out_stride;
1031 }
1032
1033 /* o7[0-3] */
1034 {
1035 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1036
1037 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1038 pi2_src_scratch += 8;
1039
1040 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1041 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1042
1043 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1044 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1045 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1046 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1047
1048 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1049
1050 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1051 pi2_dst_scratch += 8;
1052 }
1053 }
1054 }
1055 else if(zero_last8_rows_stg1)
1056 {
1057 for(j = 0; j < 2; j++)
1058 {
1059 if(j)
1060 {
1061 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
1062 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
1063 }
1064 else
1065 {
1066 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
1067 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
1068 }
1069 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
1070 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
1071
1072 /* o0[0-3] */
1073 {
1074 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1075 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1076
1077 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1078 pi2_src_scratch += in_stride;
1079
1080 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
1081 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
1082
1083 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1084 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1085 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1086
1087
1088 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1089 m_count = _mm_cvtsi32_si128(i4_shift);
1090
1091 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
1092
1093 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1094 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1095 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1096 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1097
1098 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1099
1100 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1101 pi2_dst_scratch += out_stride;
1102 }
1103
1104 /* o1[0-3] */
1105 {
1106 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1107 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1108
1109 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1110 pi2_src_scratch += in_stride;
1111
1112 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
1113 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
1114
1115 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1116 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1117 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1118
1119 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1120 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1121 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1122 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1123
1124 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1125
1126 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1127 pi2_dst_scratch += out_stride;
1128 }
1129
1130 /* o2[0-3] */
1131 {
1132 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1133 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1134
1135 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1136 pi2_src_scratch += in_stride;
1137
1138 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
1139 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
1140
1141 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1142 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1143 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1144
1145 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1146 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1147 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1148 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1149
1150 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1151
1152 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1153 pi2_dst_scratch += out_stride;
1154 }
1155
1156 /* o3[0-3] */
1157 {
1158 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1159 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1160
1161 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1162 pi2_src_scratch += 8;
1163
1164 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
1165 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
1166
1167 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
1168 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1169 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1170
1171 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1172 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1173 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1174 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1175
1176 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1177
1178 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1179 pi2_dst_scratch += 8;
1180 }
1181
1182 /* o4[0-3] */
1183 {
1184 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1185 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1186
1187 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1188 pi2_src_scratch -= in_stride;
1189
1190 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
1191 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
1192
1193 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1194 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1195 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1196
1197
1198 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1199 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1200 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1201 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1202
1203 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1204
1205 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1206 pi2_dst_scratch -= out_stride;
1207 }
1208
1209 /* o5[0-3] */
1210 {
1211 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1212 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1213
1214 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1215 pi2_src_scratch -= in_stride;
1216
1217 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
1218 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
1219
1220 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1221 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1222 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1223
1224
1225 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1226 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1227 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1228 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1229
1230 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1231
1232 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1233 pi2_dst_scratch -= out_stride;
1234 }
1235
1236 /* o6[0-3] */
1237 {
1238 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1239 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1240
1241 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1242 pi2_src_scratch -= in_stride;
1243
1244 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1245 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
1246
1247 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1248 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1249 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1250
1251 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1252 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1253 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1254 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1255
1256 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1257
1258 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1259 pi2_dst_scratch -= out_stride;
1260 }
1261
1262 /* o7[0-3] */
1263 {
1264 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1265 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1266
1267 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1268 pi2_src_scratch += 8;
1269
1270 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1271 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1272 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1273
1274 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1275 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1276 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1277 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1278
1279 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1280
1281 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1282 pi2_dst_scratch += 8;
1283 }
1284 }
1285
1286 }
1287 else
1288 {
1289
1290
1291
1292 for(j = 0; j < 2; j++)
1293 {
1294 if(j) //H8B= higher 8 bytes L8B lower 8 bytes
1295 {
1296 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
1297 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
1298 m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
1299 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
1300 }
1301 else
1302 {
1303 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
1304 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
1305 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
1306 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
1307 }
1308 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
1309 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
1310 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
1311 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9
1312
1313
1314 /* o0[0-3] */
1315 {
1316 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1317 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1318 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1319 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1320
1321
1322 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1323 pi2_src_scratch += in_stride;
1324
1325 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
1326 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
1327 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
1328 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
1329
1330 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1331 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
1332 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
1333 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1334 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1335
1336 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1337 m_count = _mm_cvtsi32_si128(i4_shift);
1338 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1339 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1340
1341 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1342 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1343 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1344 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1345
1346 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1347
1348 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1349 pi2_dst_scratch += out_stride;
1350 }
1351
1352 /* o1[0-3] */
1353 {
1354 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1355 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1356 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1357 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1358
1359
1360 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1361 pi2_src_scratch += in_stride;
1362
1363 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
1364 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
1365 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
1366 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
1367
1368 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1369 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
1370 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
1371 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1372 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1373
1374 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1375 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1376 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1377 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1378
1379 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1380
1381 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1382 pi2_dst_scratch += out_stride;
1383 }
1384
1385 /* o2[0-3] */
1386 {
1387 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1388 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1389 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1390 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1391
1392 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1393 pi2_src_scratch += in_stride;
1394
1395 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
1396 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
1397 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
1398 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
1399
1400 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1401 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1402 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
1403 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1404 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1405
1406 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1407 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1408 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1409 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1410
1411 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1412
1413 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1414 pi2_dst_scratch += out_stride;
1415 }
1416
1417 /* o3[0-3] */
1418 {
1419 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1420 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1421 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1422 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1423
1424
1425 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1426 pi2_src_scratch += 8;
1427
1428 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
1429 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
1430 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
1431 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
1432
1433 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
1434 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
1435 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
1436 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1437 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1438
1439 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1440 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1441 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1442 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1443
1444 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1445
1446 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1447 pi2_dst_scratch += 8;
1448 }
1449
1450 /* o4[0-3] */
1451 {
1452 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1453 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1454 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1455 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1456
1457 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1458 pi2_src_scratch -= in_stride;
1459
1460 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
1461 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
1462 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
1463 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
1464
1465 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1466 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1467 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
1468 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1469 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1470
1471 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1472 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1473 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1474 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1475
1476 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1477
1478 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1479 pi2_dst_scratch -= out_stride;
1480 }
1481
1482 /* o5[0-3] */
1483 {
1484 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1485 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1486 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1487 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1488
1489
1490 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1491 pi2_src_scratch -= in_stride;
1492
1493 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
1494 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
1495 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
1496 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
1497
1498 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1499 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
1500 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
1501 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1502 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1503
1504 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1505 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1506 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1507 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1508
1509 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1510
1511 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1512 pi2_dst_scratch -= out_stride;
1513 }
1514
1515 /* o6[0-3] */
1516 {
1517 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1518 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1519 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1520 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1521
1522
1523 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1524 pi2_src_scratch -= in_stride;
1525
1526 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1527 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
1528 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
1529 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
1530
1531
1532 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1533 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1534 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
1535 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1536 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1537
1538 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1539 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1540 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1541 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1542
1543 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1544
1545 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1546 pi2_dst_scratch -= out_stride;
1547 }
1548
1549 /* o7[0-3] */
1550 {
1551 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1552 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1553 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1554 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1555
1556 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1557 pi2_src_scratch += 8;
1558
1559 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1560 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
1561 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
1562 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1563 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1564
1565
1566 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1567 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1568 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1569 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1570
1571 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1572
1573 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1574 pi2_dst_scratch += 8;
1575 }
1576 }
1577 }
1578 }
1579
1580 /* Transpose */
1581 {
1582 WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
1583 WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp);
1584 WORD32 out_stride = (trans_size << 1);
1585 WORD32 in_stride = (trans_size << 1);
1586 WORD32 j;
1587
1588 for(j = 0; j < 2; j++)
1589 {
1590 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
1591 pi2_src_scratch += in_stride;
1592 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
1593 pi2_src_scratch += in_stride;
1594 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
1595 pi2_src_scratch += in_stride;
1596 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
1597 pi2_src_scratch += 8;
1598 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
1599 pi2_src_scratch -= in_stride;
1600 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
1601 pi2_src_scratch -= in_stride;
1602 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
1603 pi2_src_scratch -= in_stride;
1604 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
1605 pi2_src_scratch += 8;
1606
1607 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
1608 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
1609
1610 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
1611 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
1612
1613 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
1614 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
1615
1616 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
1617 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
1618
1619
1620 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
1621 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
1622
1623 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
1624 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
1625
1626 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
1627 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
1628
1629 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
1630 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
1631
1632
1633 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
1634 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
1635
1636 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
1637 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
1638
1639 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
1640 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
1641
1642 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
1643 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
1644
1645 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1646 pi2_dst_scratch += out_stride;
1647 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
1648 pi2_dst_scratch += out_stride;
1649 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
1650 pi2_dst_scratch += out_stride;
1651 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
1652 pi2_dst_scratch += 8;
1653 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
1654 pi2_dst_scratch -= out_stride;
1655 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
1656 pi2_dst_scratch -= out_stride;
1657 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
1658 pi2_dst_scratch -= out_stride;
1659 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
1660 pi2_dst_scratch += 8;
1661 }
1662 }
1663 }
1664
1665 if(zero_last8_cols_stg1)
1666 {
1667 WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
1668 WORD32 out_stride = (trans_size << 1);
1669 WORD32 j;
1670
1671 m_temp_reg_40 = _mm_setzero_si128();
1672 for(j = 0; j < 2; j++)
1673 {
1674 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1675 pi2_dst_scratch += out_stride;
1676 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1677 pi2_dst_scratch += out_stride;
1678 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1679 pi2_dst_scratch += out_stride;
1680 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1681 pi2_dst_scratch += 8;
1682 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1683 pi2_dst_scratch -= out_stride;
1684 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1685 pi2_dst_scratch -= out_stride;
1686 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1687 pi2_dst_scratch -= out_stride;
1688 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1689 pi2_dst_scratch += 8;
1690 }
1691 }
1692
1693
1694
1695
1696 /* Stage 2 */
1697 for(i = 0; i < 2; i++)
1698 {
1699 WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
1700 WORD32 stride = (trans_size);
1701 MEM_ALIGN16 WORD16 temp_array[256];
1702
1703 i4_shift = IT_SHIFT_STAGE_2;
1704
1705 if(zero_last12_rows_stg2)
1706 {
1707 /* eeo */
1708 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1709 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1710 {
1711 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
1712
1713 pi2_src_temp += (stride * 9);
1714
1715 if(!i)
1716 {
1717 pi2_src_temp += (stride * 6 + 8);
1718 }
1719 else
1720 {
1721 pi2_src_temp += (stride * 2 + 8);
1722 }
1723
1724 pi2_src_temp -= (stride * 9);
1725
1726 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
1727
1728 m_temp_reg_20 = _mm_setzero_si128();
1729 m_temp_reg_22 = _mm_setzero_si128();
1730
1731 m_temp_reg_21 = _mm_setzero_si128();
1732 m_temp_reg_23 = _mm_setzero_si128();
1733 }
1734
1735 /* eee */
1736 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
1737 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
1738 {
1739 /* Loading coeff and src for use in next block */
1740
1741 /* m_temp_reg_20 is zero here, so the compare below yields the sign mask used to widen row 0 to 32 bits */
1742 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
1743
1744 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
1745
1746 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
1747
1748 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
1749 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
1750
1751 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
1752 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
1753
1754 m_temp_reg_26 = m_temp_reg_24;
1755 m_temp_reg_27 = m_temp_reg_25;
1756 /* */
1757
1758 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
1759 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
1760 }
1761
1762 /* eo */
1763 {
1764 WORD16 *pi2_scratch = temp_array;
1765 WORD32 out_stride = 8;
1766
1767
1768 /* eo0[0-3] */
1769 {
1770 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1771
1772 /* eeo is zero in this path, so ee[0] and ee[3] both equal eee[0], held in m_temp_reg_24-25 */
1773
1774 /* e[0][0-3] stored in pu1_dst[0] */
1775 /* e[7][0-3] stored in pu1_dst[1] */
1776 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
1777 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
1778
1779 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1780 pi2_scratch += out_stride;
1781 _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
1782 pi2_scratch += out_stride;
1783 }
1784
1785 /* eo0[4-7] */
1786 {
1787 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1788
1789 /* eeo is zero in this path, so ee[0] and ee[3] both equal eee[0], held in m_temp_reg_24-25 */
1790
1791 /* e[0][4-7] stored in pu1_dst[2] */
1792 /* e[7][4-7] stored in pu1_dst[3] */
1793
1794 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
1795 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
1796
1797 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1798 pi2_scratch += out_stride;
1799 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1800 pi2_scratch += out_stride;
1801
1802 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
1803 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
1804
1805 }
1806
1807 /* eo1[0-3] */
1808 {
1809 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1810
1811 /* eeo is zero in this path, so ee[1] and ee[2] both equal eee[1], held in m_temp_reg_26-27 */
1812
1813 /* e[1][0-3] stored in pu1_dst[4] */
1814 /* e[6][0-3] stored in pu1_dst[5] */
1815 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
1816 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
1817
1818 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1819 pi2_scratch += out_stride;
1820 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1821 pi2_scratch += out_stride;
1822 }
1823
1824 /* eo1[4-7] */
1825 {
1826 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1827
1828 /* eeo is zero in this path, so ee[1] and ee[2] both equal eee[1], held in m_temp_reg_26-27 */
1829
1830 /* e[1][4-7] stored in pu1_dst[6]*/
1831 /* e[6][4-7] stored in pu1_dst[7] */
1832 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
1833 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
1834
1835 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1836 pi2_scratch += out_stride;
1837 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1838 pi2_scratch += out_stride;
1839
1840 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
1841
1842 }
1843
1844 /* eo2[0-3] */
1845 {
1846 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1847
1848 /* e[2][0-3] stored in pu1_dst[8]*/
1849 /* e[5][0-3] stored in pu1_dst[9] */
1850 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
1851 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
1852
1853 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1854 pi2_scratch += out_stride;
1855 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1856 pi2_scratch += out_stride;
1857 }
1858
1859 /* eo2[4-7] */
1860 {
1861 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1862
1863 /* e[2][4-7] stored in pu1_dst[10]*/
1864 /* e[5][4-7] stored in pu1_dst[11] */
1865 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
1866 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
1867
1868 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1869 pi2_scratch += out_stride;
1870 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1871 pi2_scratch += out_stride;
1872
1873 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
1874 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
1875
1876 }
1877
1878 /* eo3[0-3] */
1879 {
1880 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1881
1882 /* e[3][0-3] stored in pu1_dst[12]*/
1883 /* e[4][0-3] stored in pu1_dst[13] */
1884 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
1885 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
1886
1887 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1888 pi2_scratch += out_stride;
1889 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1890 pi2_scratch += out_stride;
1891 }
1892
1893 /* eo3[4-7] */
1894 {
1895 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1896
1897 /* e[3][4-7] stored in pu1_dst[14]*/
1898 /* e[4][4-7] stored in pu1_dst[15] */
1899 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
1900 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
1901
1902 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1903 pi2_scratch += out_stride;
1904 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1905 pi2_scratch += out_stride;
1906 }
1907
1908 }
1909 }
1910 else if(zero_last8_rows_stg2)
1911 {
1912 /* eeo */
1913 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1914 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1915 {
1916
1917 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83
1918 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36
1919
1920 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
1921 pi2_src_temp += (stride);
1922 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
1923 pi2_src_temp += (stride * 8);
1924
1925 if(!i)
1926 {
1927 pi2_src_temp += (stride * 6 + 8);
1928 }
1929 else
1930 {
1931 pi2_src_temp += (stride * 2 + 8);
1932 }
1933
1934 pi2_src_temp -= (stride * 8);
1935 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
1936 pi2_src_temp -= (stride);
1937 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
1938
1939
1940 m_temp_reg_76 = _mm_setzero_si128();
1941
1942
1943 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
1944 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
1945
1946 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
1947 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
1948
1949 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1950 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1951
1952 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1953 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1954 }
1955
1956 /* eee */
1957 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
1958 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
1959 {
1960 /* Loading coeff and src for use in next block */
1961
1962
1963 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
1964
1965 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
1966
1967 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
1968 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
1969 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
1970
1971 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
1972
1973 m_temp_reg_26 = m_temp_reg_24;
1974 m_temp_reg_27 = m_temp_reg_25;
1975
1976 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1977 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1978 }
1979
1980 /* eo */
1981 {
1982 WORD16 *pi2_scratch = temp_array;
1983 WORD32 out_stride = 8;
1984
1985
1986 /* eo0[0-3] */
1987 {
1988 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1989
1990 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1991 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
1992 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
1993
1994 /* e[0][0-3] stored in pu1_dst[0] */
1995 /* e[7][0-3] stored in pu1_dst[1] */
1996 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1997 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1998
1999 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2000 pi2_scratch += out_stride;
2001 _mm_store_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
2002 pi2_scratch += out_stride;
2003 }
2004
2005 /* eo0[4-7] */
2006 {
2007 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
2008
2009 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2010 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
2011 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
2012
2013 /* e[0][4-7] stored in pu1_dst[2] */
2014 /* e[7][4-7] stored in pu1_dst[3] */
2015
2016 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2017 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2018
2019 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2020 pi2_scratch += out_stride;
2021 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2022 pi2_scratch += out_stride;
2023
2024 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2025 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
2026
2027 }
2028
2029 /* eo1[0-3] */
2030 {
2031 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2032
2033 /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2034 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
2035 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
2036
2037 /* e[1][0-3] stored in pu1_dst[4] */
2038 /* e[6][0-3] stored in pu1_dst[5] */
2039 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
2040 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
2041
2042 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2043 pi2_scratch += out_stride;
2044 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2045 pi2_scratch += out_stride;
2046 }
2047
2048 /* eo1[4-7] */
2049 {
2050 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
2051
2052 /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2053 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
2054 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
2055
2056 /* e[1][4-7] stored in pu1_dst[6]*/
2057 /* e[6][4-7] stored in pu1_dst[7] */
2058 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
2059 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
2060
2061 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2062 pi2_scratch += out_stride;
2063 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2064 pi2_scratch += out_stride;
2065
2066 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
2067
2068 }
2069
2070 /* eo2[0-3] */
2071 {
2072 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2073
2074 /* e[2][0-3] stored in pu1_dst[8]*/
2075 /* e[5][0-3] stored in pu1_dst[9] */
2076 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
2077 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
2078
2079 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2080 pi2_scratch += out_stride;
2081 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2082 pi2_scratch += out_stride;
2083 }
2084
2085 /* eo2[4-7] */
2086 {
2087 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
2088
2089 /* e[2][4-7] stored in pu1_dst[10]*/
2090 /* e[5][4-7] stored in pu1_dst[11] */
2091 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
2092 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
2093
2094 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2095 pi2_scratch += out_stride;
2096 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2097 pi2_scratch += out_stride;
2098
2099 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
2100 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
2101
2102 }
2103
2104 /* eo3[0-3] */
2105 {
2106 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2107
2108 /* e[3][0-3] stored in pu1_dst[12]*/
2109 /* e[4][0-3] stored in pu1_dst[13] */
2110 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
2111 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
2112
2113 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2114 pi2_scratch += out_stride;
2115 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2116 pi2_scratch += out_stride;
2117 }
2118
2119 /* eo3[4-7] */
2120 {
2121 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
2122
2123 /* e[3][4-7] stored in pu1_dst[14]*/
2124 /* e[4][4-7] stored in pu1_dst[15] */
2125 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
2126 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
2127
2128 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2129 pi2_scratch += out_stride;
2130 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2131 pi2_scratch += out_stride;
2132 }
2133 }
2134 }
2135
2136 else
2137 {
2138 /* eeo */
2139 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
2140 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
2141 {
2142
2143
2144 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
2145 pi2_src_temp += (stride);
2146 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
2147 pi2_src_temp += (stride * 7);
2148 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
2149 pi2_src_temp += (stride);
2150 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
2151 if(!i)
2152 {
2153 pi2_src_temp += (stride * 6 + 8);
2154 }
2155 else
2156 {
2157 pi2_src_temp += (stride * 2 + 8);
2158 }
2159 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
2160 pi2_src_temp -= (stride);
2161 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
2162 pi2_src_temp -= (stride * 7);
2163 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
2164 pi2_src_temp -= (stride);
2165 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
2166
2167 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
2168 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
2169
2170 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
2171 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
2172
2173 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2174 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2175
2176 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2177 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2178
2179
2180 }
2181
2182 /* eee */
2183 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
2184 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
2185 {
2186 /* Loading coeff and src for use in next block */
2187 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64
2188 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
2189
2190 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
2191 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
2192
2193 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2194 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
2195
2196 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2197 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
2198
2199 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
2200 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
2201
2202 }
2203
2204 /* eo */
2205 {
2206 WORD16 *pi2_scratch = temp_array;
2207 WORD32 out_stride = 8;
2208
2209
2210
2211 /* eo0[0-3] */
2212 {
2213 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
2214 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
2215 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
2216 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
2217
2218
2219 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2220 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
2221
2222
2223 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2224 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
2225 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
2226
2227
2228 /* e[0][0-3] stored in pi2_tmp[0][0-7] */
2229 /* e[7][0-3] stored in pi2_tmp[0][8-15] */
2230 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
2231 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
2232 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2233 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2234
2235 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2236 pi2_scratch += out_stride;
2237 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2238 pi2_scratch += out_stride;
2239
2240
2241 }
2242
2243 /* eo0[4-7] */
2244 {
2245
2246 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2247 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
2248
2249 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2250 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
2251 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
2252
2253 /* e[0][4-7] stored in pi2_tmp[1][0-7] */
2254 /* e[7][4-7] stored in pi2_tmp[1][8-15] */
2255 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2256 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2257 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2258 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2259
2260 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2261 pi2_scratch += out_stride;
2262 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2263 pi2_scratch += out_stride;
2264
2265 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2266 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
2267
2268 }
2269
2270 /* eo1[0-3] */
2271 {
2272 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2273 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
2274
2275 /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2276 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
2277 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
2278
2279 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
2280 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
2281 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
2282 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
2283 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
2284 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
2285
2286 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2287 pi2_scratch += out_stride;
2288 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2289 pi2_scratch += out_stride;
2290
2291 }
2292
2293 /* eo1[4-7] */
2294 {
2295 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
2296 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2297
2298 /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2299 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
2300 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
2301
2302 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
2303 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
2304 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
2305 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
2306 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
2307 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
2308
2309 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2310 pi2_scratch += out_stride;
2311 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2312 pi2_scratch += out_stride;
2313 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
2314 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
2315 }
2316
2317 /* eo2[0-3] */
2318 {
2319 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2320 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
2321
2322 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
2323 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
2324 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
2325 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
2326 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2327 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2328
2329 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2330 pi2_scratch += out_stride;
2331 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2332 pi2_scratch += out_stride;
2333 }
2334
2335 /* eo2[4-7] */
2336 {
2337 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2338 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
2339
2340 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
2341 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
2342 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
2343 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
2344 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2345 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2346
2347 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2348 pi2_scratch += out_stride;
2349 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2350 pi2_scratch += out_stride;
2351
2352 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
2353 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
2354
2355 }
2356
2357 /* eo3[0-3] */
2358 {
2359 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2360 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
2361
2362 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
2363 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
2364 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
2365 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
2366 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2367 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2368
2369
2370 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2371 pi2_scratch += out_stride;
2372 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2373 pi2_scratch += out_stride;
2374 }
2375
2376 /* eo3[4-7] */
2377 {
2378 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
2379 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2380
2381 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
2382 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
2383 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
2384 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
2385 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2386 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2387
2388 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2389 pi2_scratch += out_stride;
2390 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2391 pi2_scratch += out_stride;
2392 }
2393 }
2394 }
2395
2396 if(zero_last12_rows_stg2)
2397 {
2398 /* o & stage 2 pre-transposed out */
2399 {
2400 WORD32 j;
2401 WORD16 *pi2_src_scratch = temp_array;
2402 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2403 WORD32 out_stride = (trans_size);
2404 WORD32 in_stride = (8) * 4;
2405
2406 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2407
2408 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2409
2410 pi2_src_temp += (stride * 9);
2411
2412 if(0 == i)
2413 {
2414 pi2_src_temp -= (stride * 2 - 8);
2415 }
2416 else
2417 {
2418 pi2_src_temp -= (stride * 6 - 8);
2419 }
2420 pi2_src_temp -= (stride * 9);
2421
2422 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2423
2424
2425 for(j = 0; j < 2; j++)
2426 {
2427 if(j)
2428 {
2429 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2430 }
2431 else
2432 {
2433 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2434 }
2435 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2436
2437 /* o0[0-3] */
2438 {
2439 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2440
2441 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2442 pi2_src_scratch += in_stride;
2443
2444 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2445
2446 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2447 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2448
2449
2450 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2451 m_count = _mm_cvtsi32_si128(i4_shift);
2452 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2453
2454
2455 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2456 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2457 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2458 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2459
2460 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2461
2462 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2463 pi2_dst_scratch += out_stride;
2464 }
2465
2466 /* o1[0-3] */
2467 {
2468 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2469
2470 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2471 pi2_src_scratch += in_stride;
2472
2473 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
2474
2475 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2476 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2477
2478 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2479 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2480 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2481 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2482
2483 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2484
2485 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2486 pi2_dst_scratch += ((!i) * out_stride + 8);
2487 }
2488
2489 /* o2[0-3] */
2490 {
2491 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2492
2493 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2494 pi2_src_scratch += in_stride;
2495
2496 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
2497
2498 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2499 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2500
2501 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2502 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2503 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2504 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2505
2506 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2507
2508 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2509 pi2_dst_scratch += out_stride;
2510 }
2511
2512 /* o3[0-3] */
2513 {
2514 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2515
2516 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2517 pi2_src_scratch += 8;
2518
2519 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
2520
2521 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2522 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2523
2524 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2525 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2526 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2527 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2528
2529 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2530
2531 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2532 pi2_dst_scratch += (i * out_stride + 8);
2533 }
2534
2535 /* o4[0-3] */
2536 {
2537 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2538
2539 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2540 pi2_src_scratch -= in_stride;
2541
2542 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
2543
2544 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2545 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2546
2547
2548 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2549 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2550 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2551 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2552
2553 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2554
2555 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2556 pi2_dst_scratch += out_stride;
2557 }
2558
2559 /* o5[0-3] */
2560 {
2561 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2562
2563 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2564 pi2_src_scratch -= in_stride;
2565
2566 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
2567
2568 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2569 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2570
2571
2572 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2573 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2574 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2575 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2576
2577 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2578
2579 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2580 pi2_dst_scratch += ((!i) * out_stride + 8);
2581 }
2582
2583 /* o6[0-3] */
2584 {
2585 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2586
2587 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2588 pi2_src_scratch -= in_stride;
2589
2590 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
2591
2592 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2593 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2594
2595 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2596 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2597 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2598 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2599
2600 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2601
2602 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2603 pi2_dst_scratch += out_stride;
2604 }
2605
2606 /* o7[0-3] */
2607 {
2608 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2609
2610 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2611 pi2_src_scratch += 8;
2612
2613 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2614 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2615
2616 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2617 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2618 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2619 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2620
2621 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2622
2623 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2624 pi2_dst_scratch += (i * out_stride + 8);
2625 }
2626
2627
2628 }
2629 }
2630 }
2631 else if(zero_last8_rows_stg2)
2632 {
2633 /* o & stage 2 pre-transposed out */
2634 {
2635 WORD32 j;
2636 WORD16 *pi2_src_scratch = temp_array;
2637 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2638 WORD32 out_stride = (trans_size);
2639 WORD32 in_stride = (8) * 4;
2640
2641 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2642
2643
2644 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2645 pi2_src_temp += (stride);
2646 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
2647 pi2_src_temp += (stride * 8);
2648
2649 if(0 == i)
2650 {
2651 pi2_src_temp -= (stride * 2 - 8);
2652 }
2653 else
2654 {
2655 pi2_src_temp -= (stride * 6 - 8);
2656 }
2657
2658 pi2_src_temp -= (stride * 8);
2659 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
2660 pi2_src_temp -= (stride);
2661 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2662
2663
2664 for(j = 0; j < 2; j++)
2665 {
2666 if(j)
2667 {
2668 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2669 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
2670 }
2671 else
2672 {
2673 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2674 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
2675 }
2676 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2677 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
2678
2679 /* o0[0-3] */
2680 {
2681 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2682 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2683
2684 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2685 pi2_src_scratch += in_stride;
2686
2687 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2688 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
2689
2690 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2691 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2692 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2693
2694
2695 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2696 m_count = _mm_cvtsi32_si128(i4_shift);
2697
2698 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2699
2700 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2701 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2702 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2703 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2704
2705 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2706
2707 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2708 pi2_dst_scratch += out_stride;
2709 }
2710
2711 /* o1[0-3] */
2712 {
2713 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2714 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2715
2716 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2717 pi2_src_scratch += in_stride;
2718
2719 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
2720 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
2721
2722 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2723 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2724 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2725
2726 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2727 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2728 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2729 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2730
2731 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2732
2733 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2734 pi2_dst_scratch += ((!i) * out_stride + 8);
2735 }
2736
2737 /* o2[0-3] */
2738 {
2739 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2740 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2741
2742 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2743 pi2_src_scratch += in_stride;
2744
2745 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
2746 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
2747
2748 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
2749 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2750 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2751
2752 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2753 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2754 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2755 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2756
2757 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2758
2759 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2760 pi2_dst_scratch += out_stride;
2761 }
2762
2763 /* o3[0-3] */
2764 {
2765 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2766 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2767
2768 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2769 pi2_src_scratch += 8;
2770
2771 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
2772 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
2773
2774 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
2775 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2776 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2777
2778 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2779 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2780 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2781 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2782
2783 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2784
2785 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2786 pi2_dst_scratch += (i * out_stride + 8);
2787 }
2788
2789 /* o4[0-3] */
2790 {
2791 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2792 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2793
2794 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2795 pi2_src_scratch -= in_stride;
2796
2797 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
2798 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
2799
2800 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
2801 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2802 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2803
2804
2805 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2806 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2807 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2808 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2809
2810 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2811
2812 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2813 pi2_dst_scratch += out_stride;
2814 }
2815
2816 /* o5[0-3] */
2817 {
2818 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2819 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2820
2821 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2822 pi2_src_scratch -= in_stride;
2823
2824 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
2825 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
2826
2827 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2828 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2829 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2830
2831
2832 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2833 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2834 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2835 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2836
2837 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2838
2839 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2840 pi2_dst_scratch += ((!i) * out_stride + 8);
2841 }
2842
2843 /* o6[0-3] */
2844 {
2845 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2846 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2847
2848 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2849 pi2_src_scratch -= in_stride;
2850
2851 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
2852 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
2853
2854 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2855 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2856 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2857
2858 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2859 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2860 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2861 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2862
2863 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2864
2865 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2866 pi2_dst_scratch += out_stride;
2867 }
2868
2869 /* o7[0-3] */
2870 {
2871 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2872 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2873
2874 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2875 pi2_src_scratch += 8;
2876
2877 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2878 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2879 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2880
2881 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2882 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2883 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2884 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2885
2886 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2887
2888 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2889 pi2_dst_scratch += (i * out_stride + 8);
2890 }
2891 }
2892 }
2893 }
2894 else
2895 {
2896 /* o & stage 2 pre-transposed out */
2897 {
2898 WORD32 j;
2899 WORD16 *pi2_src_scratch = temp_array;
2900 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2901 WORD32 out_stride = (trans_size);
2902 WORD32 in_stride = (8) * 4;
2903
2904 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2905
2906
2907 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2908 pi2_src_temp += (stride);
2909 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
2910 pi2_src_temp += (stride * 7);
2911 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
2912 pi2_src_temp += (stride);
2913 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
2914 if(0 == i)
2915 {
2916 pi2_src_temp -= (stride * 2 - 8);
2917 }
2918 else
2919 {
2920 pi2_src_temp -= (stride * 6 - 8);
2921 }
2922 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
2923 pi2_src_temp -= (stride);
2924 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
2925 pi2_src_temp -= (stride * 7);
2926 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
2927 pi2_src_temp -= (stride);
2928 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2929
2930
2931 for(j = 0; j < 2; j++)
2932 {
2933
2934 if(j) //H8B= higher 8 bytes L8B lower 8 bytes
2935 {
2936 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2937 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
2938 m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
2939 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
2940 }
2941 else
2942 {
2943 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2944 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
2945 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
2946 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
2947 }
2948 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2949 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
2950 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
2951 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9
2952
2953
2954 /* o0[0-3] */
2955 {
2956 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2957 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2958 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2959 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2960
2961
2962 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2963 pi2_src_scratch += in_stride;
2964
2965 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2966 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
2967 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
2968 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
2969
2970 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2971 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2972 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
2973 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2974 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2975
2976 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2977 m_count = _mm_cvtsi32_si128(i4_shift);
2978 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2979
2980 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2981 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2982 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2983 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2984
2985 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2986
2987 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2988 pi2_dst_scratch += out_stride;
2989 }
2990
2991 /* o1[0-3] */
2992 {
2993 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2994 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2995 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
2996 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
2997
2998
2999 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3000 pi2_src_scratch += in_stride;
3001
3002 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
3003 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
3004 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
3005 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
3006
3007 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3008 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
3009 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
3010 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3011 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3012
3013 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3014 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3015 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3016 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3017
3018 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3019
3020 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3021 pi2_dst_scratch += ((!i) * out_stride + 8);
3022 }
3023
3024 /* o2[0-3] */
3025 {
3026 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3027 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3028 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3029 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3030
3031 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3032 pi2_src_scratch += in_stride;
3033
3034 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
3035 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
3036 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
3037 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
3038
3039 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3040 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3041 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
3042 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3043 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3044
3045 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3046 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3047 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3048 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3049
3050 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3051
3052 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3053 pi2_dst_scratch += out_stride;
3054 }
3055
3056 /* o3[0-3] */
3057 {
3058 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3059 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3060 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3061 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3062
3063
3064 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3065 pi2_src_scratch += 8;
3066
3067 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
3068 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
3069 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
3070 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
3071
3072 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
3073 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
3074 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
3075 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3076 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3077
3078 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3079 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3080 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3081 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3082
3083 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3084
3085 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3086 pi2_dst_scratch += (i * out_stride + 8);
3087 }
3088
3089 /* o4[0-3] */
3090 {
3091 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3092 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3093 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3094 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3095
3096 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3097 pi2_src_scratch -= in_stride;
3098
3099 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
3100 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
3101 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
3102 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
3103
3104 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3105 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3106 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
3107 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3108 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3109
3110 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3111 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3112 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3113 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3114
3115 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3116
3117 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3118 pi2_dst_scratch += out_stride;
3119 }
3120
3121 /* o5[0-3] */
3122 {
3123 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3124 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3125 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3126 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3127
3128
3129 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3130 pi2_src_scratch -= in_stride;
3131
3132 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
3133 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
3134 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
3135 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
3136
3137 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3138 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
3139 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
3140 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3141 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3142
3143 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3144 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3145 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3146 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3147
3148 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3149
3150 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3151 pi2_dst_scratch += ((!i) * out_stride + 8);
3152 }
3153
3154 /* o6[0-3] */
3155 {
3156 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3157 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3158 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3159 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3160
3161
3162 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3163 pi2_src_scratch -= in_stride;
3164
3165 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
3166 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
3167 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
3168 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
3169
3170
3171 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3172 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3173 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3174 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3175 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3176
3177 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3178 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3179 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3180 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3181
3182 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3183
3184 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3185 pi2_dst_scratch += out_stride;
3186 }
3187
3188 /* o7[0-3] */
3189 {
3190 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3191 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3192 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3193 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3194
3195 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3196 pi2_src_scratch += 8;
3197
3198 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3199 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
3200 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
3201 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3202 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3203
3204
3205 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3206 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3207 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3208 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3209
3210 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3211
3212 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3213 pi2_dst_scratch += (i * out_stride + 8);
3214 }
3215
3216 }
3217 }
3218 }
3219 }
3220
3221 /* Transpose */
3222 {
3223 WORD16 *pi2_src_scratch;
3224 UWORD8 *pu1_pred_temp = pu1_pred;
3225 WORD32 out_stride = dst_strd;
3226 WORD32 in_stride = trans_size;
3227 WORD32 j;
3228 m_temp_reg_1 = _mm_setzero_si128();
3229 for(i = 0; i < 2; i++)
3230 {
3231 pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
3232
3233 for(j = 0; j < 2; j++)
3234 {
3235 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
3236 pi2_src_scratch += in_stride;
3237 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
3238 pi2_src_scratch += ((!i) * in_stride + 8);
3239 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
3240 pi2_src_scratch += (in_stride);
3241 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
3242 pi2_src_scratch += (i * in_stride + 8);
3243 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
3244 pi2_src_scratch += in_stride;
3245 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
3246 pi2_src_scratch += ((!i) * in_stride + 8);
3247 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
3248 pi2_src_scratch += in_stride;
3249 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
3250 pi2_src_scratch += (i * in_stride + 8);
3251
3252 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
3253 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
3254
3255 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
3256 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
3257
3258 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
3259 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
3260
3261 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
3262 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
3263
3264
3265 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
3266 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
3267
3268 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
3269 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
3270
3271 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
3272 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
3273
3274 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
3275 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
3276
3277
3278 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
3279 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3280
3281 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3282 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3283
3284 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
3285 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
3286 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
3287
3288 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
3289 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3290 pu1_dst += out_stride;
3291 pu1_pred_temp += pred_strd;
3292
3293 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
3294 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3295
3296 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3297 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3298
3299 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
3300 m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
3301 m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
3302
3303 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
3304 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3305 pu1_dst += out_stride;
3306 pu1_pred_temp += pred_strd;
3307
3308 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
3309 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3310
3311 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3312 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3313
3314 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0
3315 m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
3316 m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
3317
3318 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
3319 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3320 pu1_dst += out_stride;
3321 pu1_pred_temp += pred_strd;
3322
3323 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
3324 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3325
3326 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3327 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3328
3329 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0
3330 m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
3331 m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
3332
3333 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
3334 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3335 pu1_dst += out_stride;
3336 pu1_pred_temp += pred_strd;
3337 }
3338 }
3339 }
3340 }
3341