1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_16x16_itrans_recon_x86_intr.c
22 *
23 * @brief
24 * Contains function definitions for inverse
25 * transform and reconstruction for 16x16.
26 *
27 * @author
28 * 100470
29 * 100592 (edited by)
30 *
31 * @par List of Functions:
32 * - ihevc_itrans_recon_16x16_sse42()
33 *
34 * @remarks
35 * None
36 *
37 *******************************************************************************
38 */
39 #include <stdio.h>
40 #include <string.h>
41 #include "ihevc_typedefs.h"
42 #include "ihevc_macros.h"
43 #include "ihevc_platform_macros.h"
44 #include "ihevc_defs.h"
45 #include "ihevc_trans_tables.h"
46 #include "ihevc_itrans_recon.h"
47 #include "ihevc_func_selector.h"
48 #include "ihevc_trans_macros.h"
49
50 #include <immintrin.h>
51 #include <emmintrin.h>
52 #include <smmintrin.h>
53 #include <tmmintrin.h>
54
55 /**
56 *******************************************************************************
57 *
58 * @brief
59 * This function performs inverse transform and
60 * reconstruction for a 16x16 input block
61 *
62 * @par Description:
63 * Performs inverse transform, adds the
64 * prediction data and clips output to 8 bits
65 *
66 * @param[in] pi2_src
67 * Input 16x16 coefficients
68 *
69 * @param[in] pi2_tmp
70 * Temporary 16x16 buffer for storing inverse
71 * transform 1st stage output
72 *
73 * @param[in] pu1_pred
74 * Prediction 16x16 block
75 *
76 * @param[out] pu1_dst
77 * Output 16x16 block
78 *
79 * @param[in] src_strd
80 * Input stride
81 *
82 * @param[in] pred_strd
83 * Prediction stride
84 *
85 * @param[in] dst_strd
86 * Output Stride
87 *
88 * @param[in] zero_cols
89 * Zero columns in pi2_src; when bits 8-15 are all set (0xFF00),
90 * only the first 8 columns are processed in stage 1
91 *
92 * @param[in] zero_rows
93 * Zero rows in pi2_src; when bits 8-15 (0xFF00) or bits 4-15
94 * (0xFFF0) are all set, stage 1 uses a reduced computation for
95 * the last 8 or last 12 rows being zero
96 *
97 * @note
98 * This function takes no dequantization inputs; pi2_src is transformed as supplied
99 *
100 * @returns Void
101 *
102 * @remarks
103 * None
104 *
105 *******************************************************************************
106 */
107
ihevc_itrans_recon_16x16_sse42(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)108 void ihevc_itrans_recon_16x16_sse42(WORD16 *pi2_src,
109 WORD16 *pi2_tmp,
110 UWORD8 *pu1_pred,
111 UWORD8 *pu1_dst,
112 WORD32 src_strd,
113 WORD32 pred_strd,
114 WORD32 dst_strd,
115 WORD32 zero_cols,
116 WORD32 zero_rows)
117 {
118 __m128i m_temp_reg_0;
119 __m128i m_temp_reg_1;
120 __m128i m_temp_reg_10;
121 __m128i m_temp_reg_11;
122 __m128i m_temp_reg_12;
123 __m128i m_temp_reg_13;
124 __m128i m_temp_reg_14;
125 __m128i m_temp_reg_20;
126 __m128i m_temp_reg_21;
127 __m128i m_temp_reg_22;
128 __m128i m_temp_reg_23;
129 __m128i m_temp_reg_24;
130 __m128i m_temp_reg_25;
131 __m128i m_temp_reg_26;
132 __m128i m_temp_reg_27;
133 __m128i m_temp_reg_30;
134 __m128i m_temp_reg_31;
135 __m128i m_temp_reg_32;
136 __m128i m_temp_reg_33;
137 __m128i m_temp_reg_34;
138 __m128i m_temp_reg_35;
139 __m128i m_temp_reg_36;
140 __m128i m_temp_reg_37;
141 __m128i m_temp_reg_40;
142 __m128i m_temp_reg_41;
143 __m128i m_temp_reg_42;
144 __m128i m_temp_reg_43;
145 __m128i m_temp_reg_44;
146 __m128i m_temp_reg_45;
147 __m128i m_temp_reg_46;
148 __m128i m_temp_reg_47;
149
150 __m128i m_temp_reg_70;
151 __m128i m_temp_reg_71;
152 __m128i m_temp_reg_72;
153 __m128i m_temp_reg_73;
154 __m128i m_temp_reg_74;
155 __m128i m_temp_reg_75;
156 __m128i m_temp_reg_76;
157 __m128i m_temp_reg_77;
158 __m128i m_rdng_factor;
159 __m128i m_count;
160 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
161 __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
162
163 WORD32 i;
164
165 WORD32 zero_last8_cols_stg1;
166 WORD32 zero_last8_rows_stg1;
167 WORD32 zero_last12_rows_stg1;
168 WORD32 zero_last12_rows_stg2;
169 WORD32 zero_last8_rows_stg2;
170
171 WORD32 loop = 0;
172
173 WORD32 i4_shift = IT_SHIFT_STAGE_1;
174 WORD32 trans_size = TRANS_SIZE_16;
175
176 /* Following 3 instructions replicates the value in the */
177 /* lower 16 bits of m_add_iq in the entire register */
178
179 /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
180
181 zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0;
182 zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 1 : 0;
183 zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0;
184
185 zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0;
186 zero_last8_rows_stg2 = zero_last8_cols_stg1;
187
188 if(zero_last8_cols_stg1)
189 {
190 loop = 1;
191 }
192 else
193 loop = 2;
194
195 /* i = 0 => lower 8 samples */
196 /* i = 1 => higher 8 samples */
197 for(i = 0; i < loop; i++)
198 {
199 {
200 WORD32 sample_half_index = i << 3;
201 WORD16 *pi2_tmp_src = pi2_src + sample_half_index;
202 WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
203
204 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
205 pi2_tmp_src += (src_strd << 1);
206 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
207 pi2_tmp_src += (src_strd << 1);
208 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
209 pi2_tmp_src += (src_strd << 1);
210 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
211 pi2_tmp_src += (src_strd << 1);
212 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
213 pi2_tmp_src += (src_strd << 1);
214 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
215 pi2_tmp_src += (src_strd << 1);
216 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
217 pi2_tmp_src += (src_strd << 1);
218 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
219 pi2_tmp_src += (src_strd << 1);
220
221
222
223
224 /* If last 12 rows are zero : Rishab */
225 if(zero_last12_rows_stg1)
226 {
227
228 /* eee */
229 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
230 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
231 {
232 /* Loading coeff and src for use in next block */
233
234 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign
235 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
236
237 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
238
239 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
240
241 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
242
243 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
244
245 m_temp_reg_26 = m_temp_reg_24;
246 m_temp_reg_27 = m_temp_reg_25;
247 }
248
249 /* eo */
250
251 /* eo0[0-3] */
252 {
253 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
254 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
255
256 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
257
258 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
259
260 /* e[0][0-3] stored in pi2_tmp[0][0-7] */
261 /* e[7][0-3] stored in pi2_tmp[0][8-15] */
262 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
263 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
264
265 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
266 pi2_scratch += 8;
267 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
268 pi2_scratch += 8;
269
270 }
271
272
273 /* eo0[4-7] */
274 {
275 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
276
277 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
278
279 /* e[0][4-7] stored in pi2_tmp[1][0-7] */
280 /* e[7][4-7] stored in pi2_tmp[1][8-15] */
281 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
282 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
283
284 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
285 pi2_scratch += 8;
286 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
287 pi2_scratch += 8;
288
289 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
290 }
291
292 /* eo1[0-3] */
293 {
294 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
295
296 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
297
298 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
299 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
300 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
301 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
302
303 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
304 pi2_scratch += 8;
305 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
306 pi2_scratch += 8;
307 }
308
309 /* eo1[4-7] */
310 {
311 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
312
313 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
314
315 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
316 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
317 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
318 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
319
320 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
321 pi2_scratch += 8;
322 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
323 pi2_scratch += 8;
324
325 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
326
327 }
328
329 /* eo2[0-3] */
330 {
331 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
332
333 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
334 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
335 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
336 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
337
338 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
339 pi2_scratch += 8;
340 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
341 pi2_scratch += 8;
342
343 }
344
345 /* eo2[4-7] */
346 {
347 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
348
349 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
350 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
351 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
352 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
353
354
355 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
356 pi2_scratch += 8;
357 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
358 pi2_scratch += 8;
359
360 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
361 }
362
363 /* eo3[0-3] */
364 {
365 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
366
367 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
368 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
369 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
370 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
371
372 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
373 pi2_scratch += 8;
374 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
375 pi2_scratch += 8;
376 }
377
378 /* eo3[4-7] */
379 {
380 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
381
382 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
383 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
384 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
385 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
386
387 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
388 pi2_scratch += 8;
389 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
390 pi2_scratch += 8;
391 }
392 }
393 /* If last 8 rows are zero : Rishab */
394 else if(zero_last8_rows_stg1)
395 {
396 /* eeo */
397 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
398 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
399 {
400 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
401 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
402
403 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
404 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
405
406 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
407 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
408
409 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
410 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
411
412 }
413
414 /* eee */
415 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
416 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
417 {
418 /* Loading coeff and src for use in next block */
419 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get signs
420 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
421
422 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
423
424 //m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
425
426 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
427
428 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
429
430 m_temp_reg_26 = m_temp_reg_24;
431 m_temp_reg_27 = m_temp_reg_25;
432
433 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
434 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
435 }
436
437 /* eo0[0-3] */
438 {
439 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
440 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
441
442 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
443
444 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
445 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
446 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
447
448 /* e[0][0-3] stored in pi2_tmp[0][0-7] */
449 /* e[7][0-3] stored in pi2_tmp[0][8-15] */
450 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
451 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
452
453 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
454 pi2_scratch += 8;
455 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
456 pi2_scratch += 8;
457
458 }
459
460 /* eo0[4-7] */
461 {
462 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
463
464 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
465 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
466 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
467
468 /* e[0][4-7] stored in pi2_tmp[1][0-7] */
469 /* e[7][4-7] stored in pi2_tmp[1][8-15] */
470 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
471 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
472
473 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
474 pi2_scratch += 8;
475 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
476 pi2_scratch += 8;
477
478 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
479 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
480
481 }
482
483 /* eo1[0-3] */
484 {
485 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
486
487 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
488 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
489 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
490
491 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
492 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
493 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
494 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
495
496 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
497 pi2_scratch += 8;
498 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
499 pi2_scratch += 8;
500
501 }
502
503 /* eo1[4-7] */
504 {
505 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
506
507 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
508 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
509 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
510
511 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
512 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
513 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
514 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
515
516 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
517 pi2_scratch += 8;
518 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
519 pi2_scratch += 8;
520
521 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
522 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
523
524 }
525
526 /* eo2[0-3] */
527 {
528 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
529
530 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
531 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
532 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
533 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
534
535 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
536 pi2_scratch += 8;
537 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
538 pi2_scratch += 8;
539
540 }
541
542 /* eo2[4-7] */
543 {
544 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
545
546 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
547 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
548 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
549 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
550
551 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
552 pi2_scratch += 8;
553 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
554 pi2_scratch += 8;
555
556 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
557 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
558
559 }
560
561 /* eo3[0-3] */
562 {
563 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
564
565 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
566 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
567 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
568 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
569
570 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
571 pi2_scratch += 8;
572 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
573 pi2_scratch += 8;
574 }
575
576 /* eo3[4-7] */
577 {
578 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
579
580 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
581 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
582 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
583 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
584
585 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
586 pi2_scratch += 8;
587 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
588 pi2_scratch += 8;
589 }
590 } /* If all the rows are non-zero : Rishab */
591 else
592 {
593 /* eeo */
594 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
595 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
596
597 {
598 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
599 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
600
601 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
602 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
603
604 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
605 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
606
607 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
608 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
609 }
610
611 /* eee */
612 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
613 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
614 {
615 /* Loading coeff and src for use in next block */
616 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64
617 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
618
619 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
620 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
621
622 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
623 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
624
625 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
626 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
627
628 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
629 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
630
631 }
632 /* eo0[0-3] */
633 {
634 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
635 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
636 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
637 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
638
639 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
640 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
641
642
643 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
644 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
645 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
646
647 /* e[0][0-3] stored in pi2_tmp[0][0-7] */
648 /* e[7][0-3] stored in pi2_tmp[0][8-15] */
649 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
650 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
651 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
652 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
653
654 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
655 pi2_scratch += 8;
656 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
657 pi2_scratch += 8;
658
659
660 }
661
662 /* eo0[4-7] */
663 {
664 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
665 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
666
667 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
668 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
669 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
670
671 /* e[0][4-7] stored in pi2_tmp[1][0-7] */
672 /* e[7][4-7] stored in pi2_tmp[1][8-15] */
673 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
674 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
675 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
676 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
677
678 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
679 pi2_scratch += 8;
680 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
681 pi2_scratch += 8;
682
683 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
684 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
685
686 }
687
688 /* eo1[0-3] */
689 {
690 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
691 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
692
693 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
694 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
695 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
696
697 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
698 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
699 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
700 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
701 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
702 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
703
704 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
705 pi2_scratch += 8;
706 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
707 pi2_scratch += 8;
708
709 }
710
711 /* eo1[4-7] */
712 {
713 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
714 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
715
716 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */
717 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
718 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
719
720 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
721 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
722 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
723 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
724 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
725 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
726
727 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
728 pi2_scratch += 8;
729 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
730 pi2_scratch += 8;
731 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
732 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
733 }
734
735 /* eo2[0-3] */
736 {
737 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
738 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
739
740 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
741 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
742 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
743 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
744 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
745 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
746
747 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
748 pi2_scratch += 8;
749 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
750 pi2_scratch += 8;
751 }
752
753 /* eo2[4-7] */
754 {
755 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
756 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
757
758 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
759 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
760 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
761 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
762 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
763 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
764
765 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
766 pi2_scratch += 8;
767 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
768 pi2_scratch += 8;
769
770 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
771 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
772
773 }
774
775 /* eo3[0-3] */
776 {
777 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
778 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
779
780 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
781 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
782 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
783 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
784 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
785 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
786
787
788 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
789 pi2_scratch += 8;
790 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
791 pi2_scratch += 8;
792 }
793
794 /* eo3[4-7] */
795 {
796 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
797 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
798
799 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
800 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
801 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
802 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
803 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
804 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
805
806 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
807 pi2_scratch += 8;
808 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
809 pi2_scratch += 8;
810 }
811
812 }
813 }
814
815 {
816 WORD32 sample_half_index = i << 3;
817 WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd;
818
819 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
820 pi2_tmp_src += (src_strd << 1);
821 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
822 pi2_tmp_src += (src_strd << 1);
823 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
824 pi2_tmp_src += (src_strd << 1);
825 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
826 pi2_tmp_src += (src_strd << 1);
827 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
828 pi2_tmp_src += (src_strd << 1);
829 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
830 pi2_tmp_src += (src_strd << 1);
831 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
832 pi2_tmp_src += (src_strd << 1);
833 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
834 pi2_tmp_src += (src_strd << 1);
835 }
836
837 /* o & stage 1 out */
838 {
839 WORD32 j;
840 WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
841 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
842 WORD32 out_stride = (trans_size << 1);
843 WORD32 in_stride = trans_size << 1;
844
845 if(zero_last12_rows_stg1)
846 {
847 for(j = 0; j < 2; j++)
848 {
849 if(j) //H8B= higher 8 bytes L8B lower 8 bytes
850 {
851 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
852 }
853 else
854 {
855 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
856 }
857 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
858
859
860 /* o0[0-3] */
861 {
862 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
863
864
865 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
866 pi2_src_scratch += in_stride;
867
868 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
869
870
871 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
872 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
873
874 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
875 m_count = _mm_cvtsi32_si128(i4_shift);
876 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
877
878 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
879 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
880 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
881 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
882
883 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
884
885 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
886 pi2_dst_scratch += out_stride;
887 }
888
889 /* o1[0-3] */
890 {
891 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
892
893
894 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
895 pi2_src_scratch += in_stride;
896
897 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
898
899 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
900 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
901
902 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
903 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
904 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
905 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
906
907 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
908
909 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
910 pi2_dst_scratch += out_stride;
911 }
912
913 /* o2[0-3] */
914 {
915 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
916
917 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
918 pi2_src_scratch += in_stride;
919
920 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
921
922 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
923 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
924
925 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
926 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
927 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
928 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
929
930 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
931
932 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
933 pi2_dst_scratch += out_stride;
934 }
935
936 /* o3[0-3] */
937 {
938 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
939
940 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
941 pi2_src_scratch += 8;
942
943 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
944
945 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
946 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
947
948 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
949 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
950 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
951 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
952
953 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
954
955 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
956 pi2_dst_scratch += 8;
957 }
958
959 /* o4[0-3] */
960 {
961 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
962
963 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
964 pi2_src_scratch -= in_stride;
965
966 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
967
968 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
969 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
970
971 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
972 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
973 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
974 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
975
976 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
977
978 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
979 pi2_dst_scratch -= out_stride;
980 }
981
982 /* o5[0-3] */
983 {
984 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
985
986
987 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
988 pi2_src_scratch -= in_stride;
989
990 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
991
992 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
993 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
994
995 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
996 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
997 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
998 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
999
1000 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1001
1002 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1003 pi2_dst_scratch -= out_stride;
1004 }
1005
1006 /* o6[0-3] */
1007 {
1008 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1009
1010 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1011 pi2_src_scratch -= in_stride;
1012
1013 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1014
1015 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1016 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1017
1018 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1019 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1020 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1021 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1022
1023 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1024
1025 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1026 pi2_dst_scratch -= out_stride;
1027 }
1028
1029 /* o7[0-3] */
1030 {
1031 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1032
1033 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1034 pi2_src_scratch += 8;
1035
1036 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1037 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1038
1039 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1040 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1041 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1042 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1043
1044 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1045
1046 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1047 pi2_dst_scratch += 8;
1048 }
1049 }
1050 }
1051 else if(zero_last8_rows_stg1)
1052 {
/*
 * Odd-part (o0..o7) butterfly for the case flagged by zero_last8_rows_stg1.
 * Only two interleaved input pairs are used here: m_temp_reg_10 (annotated
 * rows 1 and 3) and m_temp_reg_11 (annotated rows 5 and 7), so every o<k>
 * is built from two _mm_madd_epi16 products.
 *
 * j == 0 works on the low halves of m_temp_reg_70..73, j == 1 on the high
 * halves.
 *
 * Each o<k> sub-block:
 *   1. multiplies the two pairs with two rows of g_ai2_ihevc_trans_16_odd
 *      and combines the products (add or subtract, as coded per block),
 *   2. loads four 32-bit values from pi2_src_scratch (presumably the
 *      even-part result written by earlier code outside this view - confirm),
 *   3. forms (loaded - o<k>) and (loaded + o<k>), adds the rounding
 *      constant and arithmetic-shifts right by i4_shift,
 *   4. packs both with signed saturation into eight 16-bit lanes (sum in
 *      the low four, difference in the high four) and stores the vector to
 *      pi2_dst_scratch.
 *
 * Pointer walk per j iteration (identical for source and destination):
 * +stride, +stride, +stride, +8, -stride, -stride, -stride, +8,
 * i.e. a net advance of 16 elements per iteration.
 * The coefficients for block k+1 are loaded inside block k.
 */
1053 for(j = 0; j < 2; j++)
1054 {
1055 if(j)
1056 {
1057 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
1058 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
1059 }
1060 else
1061 {
1062 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
1063 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
1064 }
1065 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
1066 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
1067
1068 /* o0[0-3] */
1069 {
1070 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1071 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1072
1073 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1074 pi2_src_scratch += in_stride;
1075
/* Coefficients consumed by the o1 block below are fetched here. */
1076 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
1077 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
1078
1079 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1080 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1081 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1082
1083
/*
 * Rounding constant 1 << (i4_shift - 1) is broadcast to all four 32-bit
 * lanes (shuffle with 0x00); m_count carries the shift amount for
 * _mm_sra_epi32. Both are set up in o0 and reused by o1..o7.
 */
1084 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1085 m_count = _mm_cvtsi32_si128(i4_shift);
1086
1087 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
1088
1089 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1090 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1091 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1092 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1093
1094 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1095
1096 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1097 pi2_dst_scratch += out_stride;
1098 }
1099
1100 /* o1[0-3] */
1101 {
1102 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1103 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1104
1105 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1106 pi2_src_scratch += in_stride;
1107
1108 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
1109 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
1110
1111 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1112 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1113 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1114
1115 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1116 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1117 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1118 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1119
1120 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1121
1122 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1123 pi2_dst_scratch += out_stride;
1124 }
1125
1126 /* o2[0-3] */
1127 {
1128 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1129 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1130
1131 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1132 pi2_src_scratch += in_stride;
1133
1134 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
1135 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
1136
1137 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1138 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1139 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1140
1141 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1142 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1143 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1144 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1145
1146 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1147
1148 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1149 pi2_dst_scratch += out_stride;
1150 }
1151
1152 /* o3[0-3] */
1153 {
1154 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1155 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1156
1157 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
/* Turning point of the walk: step by 8 elements instead of a stride; o4..o6 then step back by the stride. */
1158 pi2_src_scratch += 8;
1159
1160 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
1161 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
1162
1163 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
1164 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1165 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1166
1167 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1168 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1169 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1170 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1171
1172 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1173
1174 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1175 pi2_dst_scratch += 8;
1176 }
1177
1178 /* o4[0-3] */
1179 {
1180 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1181 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1182
1183 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1184 pi2_src_scratch -= in_stride;
1185
1186 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
1187 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
1188
1189 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1190 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1191 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1192
1193
1194 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1195 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1196 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1197 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1198
1199 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1200
1201 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1202 pi2_dst_scratch -= out_stride;
1203 }
1204
1205 /* o5[0-3] */
1206 {
1207 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1208 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1209
1210 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1211 pi2_src_scratch -= in_stride;
1212
1213 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
1214 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
1215
1216 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1217 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1218 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1219
1220
1221 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1222 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1223 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1224 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1225
1226 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1227
1228 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1229 pi2_dst_scratch -= out_stride;
1230 }
1231
1232 /* o6[0-3] */
1233 {
1234 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1235 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1236
1237 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1238 pi2_src_scratch -= in_stride;
1239
1240 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1241 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
1242
1243 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1244 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1245 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1246
1247 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1248 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1249 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1250 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1251
1252 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1253
1254 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1255 pi2_dst_scratch -= out_stride;
1256 }
1257
1258 /* o7[0-3] */
1259 {
1260 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1261 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1262
1263 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
/* Last block of the walk: leaves the pointer 16 elements past where this j iteration started. */
1264 pi2_src_scratch += 8;
1265
1266 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1267 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1268 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1269
1270 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1271 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1272 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1273 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1274
1275 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1276
1277 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1278 pi2_dst_scratch += 8;
1279 }
1280 }
1281
1282 }
1283 else
1284 {
/*
 * General odd-part (o0..o7) butterfly, taken when neither of the preceding
 * zero-row shortcuts applies. Four interleaved input pairs are used:
 * m_temp_reg_10..13 (annotated rows 1/3, 5/7, 9/11 and 13/15), so every
 * o<k> is built from four _mm_madd_epi16 products.
 *
 * j == 0 works on the low halves of m_temp_reg_70..77, j == 1 on the high
 * halves.
 *
 * Each o<k> sub-block:
 *   1. multiplies the four pairs with four consecutive rows of
 *      g_ai2_ihevc_trans_16_odd and combines the products (the add/sub
 *      pattern differs per block, exactly as coded),
 *   2. loads four 32-bit values from pi2_src_scratch (presumably the
 *      even-part result written by earlier code outside this view - confirm),
 *   3. forms (loaded - o<k>) and (loaded + o<k>), adds the rounding
 *      constant and arithmetic-shifts right by i4_shift,
 *   4. packs both with signed saturation into eight 16-bit lanes (sum in
 *      the low four, difference in the high four) and stores the vector to
 *      pi2_dst_scratch.
 *
 * Pointer walk per j iteration (identical for source and destination):
 * +stride, +stride, +stride, +8, -stride, -stride, -stride, +8,
 * i.e. a net advance of 16 elements per iteration.
 * The coefficients for block k+1 are loaded inside block k.
 */
1285
1286
1287
1288 for(j = 0; j < 2; j++)
1289 {
1290 if(j) //H8B= higher 8 bytes L8B lower 8 bytes
1291 {
1292 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
1293 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
1294 m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
1295 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
1296 }
1297 else
1298 {
1299 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
1300 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
1301 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
1302 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
1303 }
1304 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
1305 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
1306 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
1307 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9
1308
1309
1310 /* o0[0-3] */
1311 {
1312 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1313 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1314 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1315 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1316
1317
1318 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1319 pi2_src_scratch += in_stride;
1320
/* Coefficients consumed by the o1 block below are fetched here. */
1321 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
1322 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
1323 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
1324 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
1325
1326 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1327 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
1328 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
1329 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1330 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1331
/*
 * Rounding constant 1 << (i4_shift - 1) is replicated into all four 32-bit
 * lanes by the two unpacklo steps; m_count carries the shift amount for
 * _mm_sra_epi32. Both are set up in o0 and reused by o1..o7.
 */
1332 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1333 m_count = _mm_cvtsi32_si128(i4_shift);
1334 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1335 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1336
1337 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1338 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1339 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1340 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1341
1342 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1343
1344 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1345 pi2_dst_scratch += out_stride;
1346 }
1347
1348 /* o1[0-3] */
1349 {
1350 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1351 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1352 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1353 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1354
1355
1356 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1357 pi2_src_scratch += in_stride;
1358
1359 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
1360 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
1361 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
1362 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
1363
1364 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1365 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
1366 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
1367 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1368 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1369
1370 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1371 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1372 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1373 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1374
1375 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1376
1377 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1378 pi2_dst_scratch += out_stride;
1379 }
1380
1381 /* o2[0-3] */
1382 {
1383 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1384 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1385 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1386 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1387
1388 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1389 pi2_src_scratch += in_stride;
1390
1391 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
1392 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
1393 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
1394 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
1395
1396 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1397 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1398 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
1399 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1400 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1401
1402 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1403 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1404 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1405 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1406
1407 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1408
1409 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1410 pi2_dst_scratch += out_stride;
1411 }
1412
1413 /* o3[0-3] */
1414 {
1415 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1416 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1417 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1418 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1419
1420
1421 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
/* Turning point of the walk: step by 8 elements instead of a stride; o4..o6 then step back by the stride. */
1422 pi2_src_scratch += 8;
1423
1424 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
1425 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
1426 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
1427 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
1428
1429 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
1430 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
1431 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
1432 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1433 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1434
1435 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1436 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1437 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1438 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1439
1440 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1441
1442 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1443 pi2_dst_scratch += 8;
1444 }
1445
1446 /* o4[0-3] */
1447 {
1448 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1449 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1450 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1451 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1452
1453 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1454 pi2_src_scratch -= in_stride;
1455
1456 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
1457 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
1458 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
1459 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
1460
1461 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
1462 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1463 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
1464 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1465 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1466
1467 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1468 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1469 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1470 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1471
1472 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1473
1474 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1475 pi2_dst_scratch -= out_stride;
1476 }
1477
1478 /* o5[0-3] */
1479 {
1480 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1481 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1482 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1483 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1484
1485
1486 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1487 pi2_src_scratch -= in_stride;
1488
1489 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
1490 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
1491 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
1492 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
1493
1494 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1495 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
1496 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
1497 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1498 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1499
1500 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1501 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1502 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1503 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1504
1505 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1506
1507 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1508 pi2_dst_scratch -= out_stride;
1509 }
1510
1511 /* o6[0-3] */
1512 {
1513 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1514 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1515 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1516 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1517
1518
1519 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1520 pi2_src_scratch -= in_stride;
1521
1522 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
1523 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
1524 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
1525 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
1526
1527
1528 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
1529 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
1530 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
1531 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1532 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1533
1534 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1535 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1536 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1537 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1538
1539 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1540
1541 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1542 pi2_dst_scratch -= out_stride;
1543 }
1544
1545 /* o7[0-3] */
1546 {
1547 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
1548 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
1549 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
1550 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
1551
1552 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
/* Last block of the walk: leaves the pointer 16 elements past where this j iteration started. */
1553 pi2_src_scratch += 8;
1554
1555 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
1556 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
1557 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
1558 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
1559 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
1560
1561
1562 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1563 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1564 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1565 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1566
1567 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1568
1569 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1570 pi2_dst_scratch += 8;
1571 }
1572 }
1573 }
1574 }
1575
1576 /* Transpose */
/*
 * In-place rearrangement of 16-bit data in the pi2_tmp scratch buffer.
 * pi2_src_scratch and pi2_dst_scratch are initialised to the same address
 * (pi2_tmp when i == 0, pi2_tmp + 8 * trans_size otherwise) and both use a
 * stride of (trans_size << 1) elements.
 *
 * Each of the two j iterations:
 *   - loads eight 128-bit vectors (labelled a..p in the lane comments),
 *   - interleaves them at 16-bit, then 32-bit, then 64-bit granularity,
 *   - writes eight vectors back.
 * Loads and stores follow the same pointer walk
 * (+stride, +stride, +stride, +8, -stride, -stride, -stride, +8), and all
 * eight loads are issued before the first store, so every store address
 * was already read earlier in the same iteration.
 */
1577 {
1578 WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp;
1579 WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp);
1580 WORD32 out_stride = (trans_size << 1);
1581 WORD32 in_stride = (trans_size << 1);
1582 WORD32 j;
1583
1584 for(j = 0; j < 2; j++)
1585 {
1586 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
1587 pi2_src_scratch += in_stride;
1588 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
1589 pi2_src_scratch += in_stride;
1590 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
1591 pi2_src_scratch += in_stride;
1592 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
1593 pi2_src_scratch += 8;
1594 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
1595 pi2_src_scratch -= in_stride;
1596 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
1597 pi2_src_scratch -= in_stride;
1598 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
1599 pi2_src_scratch -= in_stride;
1600 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
1601 pi2_src_scratch += 8;
1602
/*
 * Level 1: 16-bit interleave of neighbouring vectors. Note that every
 * unpackhi below takes its operands in the reverse order of the matching
 * unpacklo (see the lane annotations on each line).
 */
1603 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
1604 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
1605
1606 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
1607 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
1608
1609 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
1610 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
1611
1612 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
1613 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
1614
1615
/* Level 2: 32-bit interleave of the level-1 results. */
1616 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
1617 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
1618
1619 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
1620 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
1621
1622 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
1623 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
1624
1625 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
1626 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
1627
1628
/* Level 3: 64-bit interleave producing the eight output vectors. */
1629 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
1630 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
1631
1632 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
1633 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
1634
1635 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
1636 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
1637
1638 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
1639 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
1640
/* Write-back order is 40, 44, 41, 45, 42, 46, 43, 47 along the same walk used for the loads. */
1641 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1642 pi2_dst_scratch += out_stride;
1643 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44);
1644 pi2_dst_scratch += out_stride;
1645 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_41);
1646 pi2_dst_scratch += out_stride;
1647 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45);
1648 pi2_dst_scratch += 8;
1649 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42);
1650 pi2_dst_scratch -= out_stride;
1651 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46);
1652 pi2_dst_scratch -= out_stride;
1653 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43);
1654 pi2_dst_scratch -= out_stride;
1655 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47);
1656 pi2_dst_scratch += 8;
1657 }
1658 }
1659 }
1660
1661 if(zero_last8_cols_stg1)
1662 {
1663 WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size);
1664 WORD32 out_stride = (trans_size << 1);
1665 WORD32 j;
1666
1667 m_temp_reg_40 = _mm_setzero_si128();
1668 for(j = 0; j < 2; j++)
1669 {
1670 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1671 pi2_dst_scratch += out_stride;
1672 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1673 pi2_dst_scratch += out_stride;
1674 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1675 pi2_dst_scratch += out_stride;
1676 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1677 pi2_dst_scratch += 8;
1678 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1679 pi2_dst_scratch -= out_stride;
1680 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1681 pi2_dst_scratch -= out_stride;
1682 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1683 pi2_dst_scratch -= out_stride;
1684 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40);
1685 pi2_dst_scratch += 8;
1686 }
1687 }
1688
1689
1690
1691
1692 /* Stage 2 */
1693 for(i = 0; i < 2; i++)
1694 {
1695 //__m128i m_temp_reg_15,m_temp_reg_16;
1696 WORD16 *pi2_src_temp = (i) ? (pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp);
1697 WORD32 stride = (trans_size);
1698 WORD16 temp_array[256];
1699
1700 i4_shift = IT_SHIFT_STAGE_2;
1701
1702 if(zero_last12_rows_stg2)
1703 {
1704 /* eeo */
1705 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1706 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1707 {
1708 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
1709
1710 pi2_src_temp += (stride * 9);
1711
1712 if(!i)
1713 {
1714 pi2_src_temp += (stride * 6 + 8);
1715 }
1716 else
1717 {
1718 pi2_src_temp += (stride * 2 + 8);
1719 }
1720
1721 pi2_src_temp -= (stride * 9);
1722
1723 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
1724
1725 m_temp_reg_20 = _mm_setzero_si128();
1726 m_temp_reg_22 = _mm_setzero_si128();
1727
1728 m_temp_reg_21 = _mm_setzero_si128();
1729 m_temp_reg_23 = _mm_setzero_si128();
1730 }
1731
1732 /* eee */
1733 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
1734 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
1735 {
1736 /* Loading coeff and src for use in next block */
1737
1738 /* Loading coeff and src for use in next block */
1739 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70);
1740
1741 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
1742
1743 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
1744
1745 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
1746 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
1747
1748 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
1749 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
1750
1751 m_temp_reg_26 = m_temp_reg_24;
1752 m_temp_reg_27 = m_temp_reg_25;
1753 /* */
1754
1755 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20);
1756 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20);
1757 }
1758
1759 /* eo */
1760 {
1761 WORD16 *pi2_scratch = temp_array;
1762 WORD32 out_stride = 8;
1763
1764
1765 /* eo0[0-3] */
1766 {
1767 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1768
1769 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1770
1771 /* e[0][0-3] stored in pu1_dst[0] */
1772 /* e[7][0-3] stored in pu1_dst[1] */
1773 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
1774 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
1775
1776 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1777 pi2_scratch += out_stride;
1778 _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
1779 pi2_scratch += out_stride;
1780 }
1781
1782 /* eo0[4-7] */
1783 {
1784 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1785
1786 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1787
1788 /* e[0][4-7] stored in pu1_dst[2] */
1789 /* e[7][4-7] stored in pu1_dst[3] */
1790
1791 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
1792 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
1793
1794 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1795 pi2_scratch += out_stride;
1796 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1797 pi2_scratch += out_stride;
1798
1799 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
1800 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
1801
1802 }
1803
1804 /* eo1[0-3] */
1805 {
1806 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1807
1808 /* eeo is zero in this path, so ee[1] and ee[2] both equal eee[1] (m_temp_reg_26) */
1809
1810 /* e[1][0-3] stored in pu1_dst[4] */
1811 /* e[6][0-3] stored in pu1_dst[5] */
1812 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
1813 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
1814
1815 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1816 pi2_scratch += out_stride;
1817 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1818 pi2_scratch += out_stride;
1819 }
1820
1821 /* eo1[4-7] */
1822 {
1823 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1824
1825 /* eeo is zero in this path, so ee[1] and ee[2] both equal eee[1] (m_temp_reg_27) */
1826
1827 /* e[1][4-7] stored in pu1_dst[6]*/
1828 /* e[6][4-7] stored in pu1_dst[7] */
1829 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
1830 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
1831
1832 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1833 pi2_scratch += out_stride;
1834 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1835 pi2_scratch += out_stride;
1836
1837 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
1838
1839 }
1840
1841 /* eo2[0-3] */
1842 {
1843 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1844
1845 /* e[2][0-3] stored in pu1_dst[8]*/
1846 /* e[5][0-3] stored in pu1_dst[9] */
1847 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30);
1848 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30);
1849
1850 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1851 pi2_scratch += out_stride;
1852 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1853 pi2_scratch += out_stride;
1854 }
1855
1856 /* eo2[4-7] */
1857 {
1858 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1859
1860 /* e[2][4-7] stored in pu1_dst[10]*/
1861 /* e[5][4-7] stored in pu1_dst[11] */
1862 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31);
1863 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31);
1864
1865 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1866 pi2_scratch += out_stride;
1867 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1868 pi2_scratch += out_stride;
1869
1870 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
1871 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
1872
1873 }
1874
1875 /* eo3[0-3] */
1876 {
1877 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1878
1879 /* e[3][0-3] stored in pu1_dst[12]*/
1880 /* e[4][0-3] stored in pu1_dst[13] */
1881 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30);
1882 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30);
1883
1884 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1885 pi2_scratch += out_stride;
1886 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1887 pi2_scratch += out_stride;
1888 }
1889
1890 /* eo3[4-7] */
1891 {
1892 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1893
1894 /* e[3][4-7] stored in pu1_dst[14]*/
1895 /* e[4][4-7] stored in pu1_dst[15] */
1896 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31);
1897 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31);
1898
1899 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1900 pi2_scratch += out_stride;
1901 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1902 pi2_scratch += out_stride;
1903 }
1904
1905 }
1906 }
1907 else if(zero_last8_rows_stg2)
1908 {
1909 /* eeo */
1910 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1911 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1912 {
1913
1914 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83
1915 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36
1916
1917 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
1918 pi2_src_temp += (stride);
1919 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
1920 pi2_src_temp += (stride * 8);
1921
1922 if(!i)
1923 {
1924 pi2_src_temp += (stride * 6 + 8);
1925 }
1926 else
1927 {
1928 pi2_src_temp += (stride * 2 + 8);
1929 }
1930
1931 pi2_src_temp -= (stride * 8);
1932 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
1933 pi2_src_temp -= (stride);
1934 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
1935
1936
1937 m_temp_reg_76 = _mm_setzero_si128();
1938
1939
1940 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
1941 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
1942
1943 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
1944 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
1945
1946 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1947 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1948
1949 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1950 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1951 }
1952
1953 /* eee */
1954 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
1955 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
1956 {
1957 /* Loading coeff and src for use in next block */
1958
1959
1960 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70);
1961
1962 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0
1963
1964 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6);
1965 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77);
1966 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6);
1967
1968 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
1969
1970 m_temp_reg_26 = m_temp_reg_24;
1971 m_temp_reg_27 = m_temp_reg_25;
1972
1973 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1974 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1975 }
1976
1977 /* eo */
1978 {
1979 WORD16 *pi2_scratch = temp_array;
1980 WORD32 out_stride = 8;
1981
1982
1983 /* eo0[0-3] */
1984 {
1985 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1986
1987 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
1988 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
1989 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
1990
1991 /* e[0][0-3] stored in pu1_dst[0] */
1992 /* e[7][0-3] stored in pu1_dst[1] */
1993 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1994 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1995
1996 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1997 pi2_scratch += out_stride;
1998 _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35);
1999 pi2_scratch += out_stride;
2000 }
2001
2002 /* eo0[4-7] */
2003 {
2004 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
2005
2006 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2007 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
2008 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
2009
2010 /* e[0][4-7] stored in pu1_dst[2] */
2011 /* e[7][4-7] stored in pu1_dst[3] */
2012
2013 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2014 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2015
2016 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2017 pi2_scratch += out_stride;
2018 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2019 pi2_scratch += out_stride;
2020
2021 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2022 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
2023
2024 }
2025
2026 /* eo1[0-3] */
2027 {
2028 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2029
2030 /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2031 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
2032 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
2033
2034 /* e[1][0-3] stored in pu1_dst[4] */
2035 /* e[6][0-3] stored in pu1_dst[5] */
2036 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
2037 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
2038
2039 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2040 pi2_scratch += out_stride;
2041 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2042 pi2_scratch += out_stride;
2043 }
2044
2045 /* eo1[4-7] */
2046 {
2047 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
2048
2049 /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2050 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
2051 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
2052
2053 /* e[1][4-7] stored in pu1_dst[6]*/
2054 /* e[6][4-7] stored in pu1_dst[7] */
2055 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
2056 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
2057
2058 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2059 pi2_scratch += out_stride;
2060 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2061 pi2_scratch += out_stride;
2062
2063 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
2064
2065 }
2066
2067 /* eo2[0-3] */
2068 {
2069 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2070
2071 /* e[2][0-3] stored in pu1_dst[8]*/
2072 /* e[5][0-3] stored in pu1_dst[9] */
2073 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
2074 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
2075
2076 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2077 pi2_scratch += out_stride;
2078 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2079 pi2_scratch += out_stride;
2080 }
2081
2082 /* eo2[4-7] */
2083 {
2084 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
2085
2086 /* e[2][4-7] stored in pu1_dst[10]*/
2087 /* e[5][4-7] stored in pu1_dst[11] */
2088 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
2089 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
2090
2091 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2092 pi2_scratch += out_stride;
2093 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2094 pi2_scratch += out_stride;
2095
2096 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
2097 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
2098
2099 }
2100
2101 /* eo3[0-3] */
2102 {
2103 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2104
2105 /* e[3][0-3] stored in pu1_dst[12]*/
2106 /* e[4][0-3] stored in pu1_dst[13] */
2107 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
2108 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
2109
2110 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2111 pi2_scratch += out_stride;
2112 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2113 pi2_scratch += out_stride;
2114 }
2115
2116 /* eo3[4-7] */
2117 {
2118 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
2119
2120 /* e[3][4-7] stored in pu1_dst[14]*/
2121 /* e[4][4-7] stored in pu1_dst[15] */
2122 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
2123 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
2124
2125 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2126 pi2_scratch += out_stride;
2127 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2128 pi2_scratch += out_stride;
2129 }
2130 }
2131 }
2132
2133 else
2134 {
2135 /* eeo */
2136 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
2137 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
2138 {
2139
2140
2141 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0
2142 pi2_src_temp += (stride);
2143 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4
2144 pi2_src_temp += (stride * 7);
2145 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8
2146 pi2_src_temp += (stride);
2147 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12
2148 if(!i)
2149 {
2150 pi2_src_temp += (stride * 6 + 8);
2151 }
2152 else
2153 {
2154 pi2_src_temp += (stride * 2 + 8);
2155 }
2156 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14
2157 pi2_src_temp -= (stride);
2158 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10
2159 pi2_src_temp -= (stride * 7);
2160 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6
2161 pi2_src_temp -= (stride);
2162 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2
2163
2164 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36
2165 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83
2166
2167 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's
2168 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's
2169
2170 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
2171 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
2172
2173 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
2174 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
2175
2176
2177 }
2178
2179 /* eee */
2180 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */
2181 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */
2182 {
2183 /* Loading coeff and src for use in next block */
2184 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64
2185 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64
2186
2187 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's
2188 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's
2189
2190 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
2191 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4);
2192
2193 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
2194 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);
2195
2196 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75
2197 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18
2198
2199 }
2200
2201 /* eo */
2202 {
2203 WORD16 *pi2_scratch = temp_array;
2204 WORD32 out_stride = 8;
2205
2206
2207
2208 /* eo0[0-3] */
2209 {
2210 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
2211 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
2212 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
2213 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
2214
2215
2216 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2217 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
2218
2219
2220 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2221 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20);
2222 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20);
2223
2224
2225 /* e[0][0-3] stored in pi2_tmp[0][0-7] */
2226 /* e[7][0-3] stored in pi2_tmp[0][8-15] */
2227 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
2228 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
2229 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2230 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2231
2232 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2233 pi2_scratch += out_stride;
2234 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2235 pi2_scratch += out_stride;
2236
2237
2238 }
2239
2240 /* eo0[4-7] */
2241 {
2242
2243 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2244 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
2245
2246 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */
2247 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21);
2248 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21);
2249
2250 /* e[0][4-7] stored in pi2_tmp[1][0-7] */
2251 /* e[7][4-7] stored in pi2_tmp[1][8-15] */
2252 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
2253 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
2254 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2255 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2256
2257 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2258 pi2_scratch += out_stride;
2259 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2260 pi2_scratch += out_stride;
2261
2262 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18
2263 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50
2264
2265 }
2266
2267 /* eo1[0-3] */
2268 {
2269 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2270 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
2271
2272 /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2273 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22);
2274 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22);
2275
2276 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
2277 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
2278 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
2279 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
2280 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32);
2281 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32);
2282
2283 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2284 pi2_scratch += out_stride;
2285 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2286 pi2_scratch += out_stride;
2287
2288 }
2289
2290 /* eo1[4-7] */
2291 {
2292 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
2293 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2294
2295 /* ee[1] and ee[2] stored in m_temp_reg_42-43 & m_temp_reg_44-45 */
2296 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23);
2297 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23);
2298
2299 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
2300 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
2301 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31);
2302 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31);
2303 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33);
2304 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33);
2305
2306 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2307 pi2_scratch += out_stride;
2308 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2309 pi2_scratch += out_stride;
2310 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89
2311 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75
2312 }
2313
2314 /* eo2[0-3] */
2315 {
2316 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2317 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2);
2318
2319 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
2320 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
2321 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
2322 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
2323 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2324 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2325
2326 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2327 pi2_scratch += out_stride;
2328 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2329 pi2_scratch += out_stride;
2330 }
2331
2332 /* eo2[4-7] */
2333 {
2334 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1);
2335 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2);
2336
2337 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
2338 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
2339 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31);
2340 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31);
2341 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2342 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2343
2344 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2345 pi2_scratch += out_stride;
2346 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2347 pi2_scratch += out_stride;
2348
2349 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50
2350 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89
2351
2352 }
2353
2354 /* eo3[0-3] */
2355 {
2356 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
2357 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4);
2358
2359 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
2360 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
2361 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
2362 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
2363 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32);
2364 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32);
2365
2366
2367 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2368 pi2_scratch += out_stride;
2369 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2370 pi2_scratch += out_stride;
2371 }
2372
2373 /* eo3[4-7] */
2374 {
2375 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3);
2376 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2377
2378 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
2379 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
2380 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31);
2381 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31);
2382 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33);
2383 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33);
2384
2385 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
2386 pi2_scratch += out_stride;
2387 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
2388 pi2_scratch += out_stride;
2389 }
2390 }
2391 }
2392
2393 if(zero_last12_rows_stg2)
2394 {
2395 /* o & stage 2 pre-transposed out */
2396 {
2397 WORD32 j;
2398 WORD16 *pi2_src_scratch = temp_array;
2399 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2400 WORD32 out_stride = (trans_size);
2401 WORD32 in_stride = (8) * 4;
2402
2403 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2404
2405 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2406
2407 pi2_src_temp += (stride * 9);
2408
2409 if(0 == i)
2410 {
2411 pi2_src_temp -= (stride * 2 - 8);
2412 }
2413 else
2414 {
2415 pi2_src_temp -= (stride * 6 - 8);
2416 }
2417 pi2_src_temp -= (stride * 9);
2418
2419 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2420
2421
2422 for(j = 0; j < 2; j++)
2423 {
2424 if(j)
2425 {
2426 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2427 }
2428 else
2429 {
2430 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2431 }
2432 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2433
2434 /* o0[0-3] */
2435 {
2436 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2437
2438 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2439 pi2_src_scratch += in_stride;
2440
2441 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2442
2443 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2444 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2445
2446
2447 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2448 m_count = _mm_cvtsi32_si128(i4_shift);
2449 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2450
2451
2452 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2453 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2454 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2455 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2456
2457 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2458
2459 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2460 pi2_dst_scratch += out_stride;
2461 }
2462
2463 /* o1[0-3] */
2464 {
2465 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2466
2467 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2468 pi2_src_scratch += in_stride;
2469
2470 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
2471
2472 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2473 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2474
2475 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2476 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2477 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2478 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2479
2480 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2481
2482 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2483 pi2_dst_scratch += ((!i) * out_stride + 8);
2484 }
2485
2486 /* o2[0-3] */
2487 {
2488 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2489
2490 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2491 pi2_src_scratch += in_stride;
2492
2493 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
2494
2495 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2496 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2497
2498 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2499 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2500 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2501 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2502
2503 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2504
2505 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2506 pi2_dst_scratch += out_stride;
2507 }
2508
2509 /* o3[0-3] */
2510 {
2511 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2512
2513 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2514 pi2_src_scratch += 8;
2515
2516 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
2517
2518 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2519 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2520
2521 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2522 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2523 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2524 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2525
2526 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2527
2528 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2529 pi2_dst_scratch += (i * out_stride + 8);
2530 }
2531
2532 /* o4[0-3] */
2533 {
2534 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2535
2536 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2537 pi2_src_scratch -= in_stride;
2538
2539 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
2540
2541 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2542 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2543
2544
2545 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2546 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2547 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2548 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2549
2550 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2551
2552 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2553 pi2_dst_scratch += out_stride;
2554 }
2555
2556 /* o5[0-3] */
2557 {
2558 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2559
2560 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2561 pi2_src_scratch -= in_stride;
2562
2563 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
2564
2565 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2566 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2567
2568
2569 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2570 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2571 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2572 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2573
2574 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2575
2576 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2577 pi2_dst_scratch += ((!i) * out_stride + 8);
2578 }
2579
2580 /* o6[0-3] */
2581 {
2582 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2583
2584 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2585 pi2_src_scratch -= in_stride;
2586
2587 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
2588
2589 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2590 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2591
2592 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2593 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2594 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2595 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2596
2597 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2598
2599 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2600 pi2_dst_scratch += out_stride;
2601 }
2602
2603 /* o7[0-3] */
2604 {
2605 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2606
2607 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2608 pi2_src_scratch += 8;
2609
2610 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2611 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2612
2613 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2614 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2615 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2616 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2617
2618 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2619
2620 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2621 pi2_dst_scratch += (i * out_stride + 8);
2622 }
2623
2624
2625 }
2626 }
2627 }
2628 else if(zero_last8_rows_stg2)
2629 {
2630 /* o & stage 2 pre-transposed out */
2631 {
2632 WORD32 j;
2633 WORD16 *pi2_src_scratch = temp_array;
2634 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2635 WORD32 out_stride = (trans_size);
2636 WORD32 in_stride = (8) * 4;
2637
2638 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2639
2640
2641 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2642 pi2_src_temp += (stride);
2643 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
2644 pi2_src_temp += (stride * 8);
2645
2646 if(0 == i)
2647 {
2648 pi2_src_temp -= (stride * 2 - 8);
2649 }
2650 else
2651 {
2652 pi2_src_temp -= (stride * 6 - 8);
2653 }
2654
2655 pi2_src_temp -= (stride * 8);
2656 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
2657 pi2_src_temp -= (stride);
2658 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2659
2660
2661 for(j = 0; j < 2; j++)
2662 {
2663 if(j)
2664 {
2665 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2666 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
2667 }
2668 else
2669 {
2670 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2671 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
2672 }
2673 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2674 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
2675
2676 /* o0[0-3] */
2677 {
2678 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2679 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2680
2681 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2682 pi2_src_scratch += in_stride;
2683
2684 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2685 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
2686
2687 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2688 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2689 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2690
2691
2692 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2693 m_count = _mm_cvtsi32_si128(i4_shift);
2694
2695 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2696
2697 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2698 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2699 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2700 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2701
2702 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2703
2704 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2705 pi2_dst_scratch += out_stride;
2706 }
2707
2708 /* o1[0-3] */
2709 {
2710 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2711 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2712
2713 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2714 pi2_src_scratch += in_stride;
2715
2716 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
2717 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
2718
2719 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2720 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2721 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2722
2723 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2724 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2725 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2726 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2727
2728 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2729
2730 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2731 pi2_dst_scratch += ((!i) * out_stride + 8);
2732 }
2733
2734 /* o2[0-3] */
2735 {
2736 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2737 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2738
2739 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2740 pi2_src_scratch += in_stride;
2741
2742 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
2743 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
2744
2745 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
2746 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2747 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2748
2749 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2750 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2751 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2752 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2753
2754 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2755
2756 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2757 pi2_dst_scratch += out_stride;
2758 }
2759
2760 /* o3[0-3] */
2761 {
2762 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2763 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2764
2765 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2766 pi2_src_scratch += 8;
2767
2768 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
2769 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
2770
2771 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
2772 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2773 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2774
2775 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2776 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2777 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2778 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2779
2780 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2781
2782 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2783 pi2_dst_scratch += (i * out_stride + 8);
2784 }
2785
2786 /* o4[0-3] */
2787 {
2788 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2789 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2790
2791 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2792 pi2_src_scratch -= in_stride;
2793
2794 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
2795 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
2796
2797 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
2798 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2799 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2800
2801
2802 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2803 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2804 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2805 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2806
2807 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2808
2809 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2810 pi2_dst_scratch += out_stride;
2811 }
2812
2813 /* o5[0-3] */
2814 {
2815 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2816 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2817
2818 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2819 pi2_src_scratch -= in_stride;
2820
2821 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
2822 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
2823
2824 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2825 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2826 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2827
2828
2829 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2830 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2831 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2832 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2833
2834 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2835
2836 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2837 pi2_dst_scratch += ((!i) * out_stride + 8);
2838 }
2839
2840 /* o6[0-3] */
2841 {
2842 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2843 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2844
2845 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2846 pi2_src_scratch -= in_stride;
2847
2848 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
2849 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
2850
2851 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2852 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2853 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2854
2855 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2856 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2857 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2858 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2859
2860 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2861
2862 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2863 pi2_dst_scratch += out_stride;
2864 }
2865
2866 /* o7[0-3] */
2867 {
2868 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2869 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2870
2871 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2872 pi2_src_scratch += 8;
2873
2874 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
2875 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
2876 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
2877
2878 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2879 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2880 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2881 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2882
2883 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2884
2885 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2886 pi2_dst_scratch += (i * out_stride + 8);
2887 }
2888 }
2889 }
2890 }
2891 else
2892 {
2893 /* o & stage 2 pre-transposed out */
2894 {
2895 WORD32 j;
2896 WORD16 *pi2_src_scratch = temp_array;
2897 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp);
2898 WORD32 out_stride = (trans_size);
2899 WORD32 in_stride = (8) * 4;
2900
2901 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2);
2902
2903
2904 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1
2905 pi2_src_temp += (stride);
2906 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5
2907 pi2_src_temp += (stride * 7);
2908 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9
2909 pi2_src_temp += (stride);
2910 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13
2911 if(0 == i)
2912 {
2913 pi2_src_temp -= (stride * 2 - 8);
2914 }
2915 else
2916 {
2917 pi2_src_temp -= (stride * 6 - 8);
2918 }
2919 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15
2920 pi2_src_temp -= (stride);
2921 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11
2922 pi2_src_temp -= (stride * 7);
2923 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7
2924 pi2_src_temp -= (stride);
2925 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3
2926
2927
2928 for(j = 0; j < 2; j++)
2929 {
2930
2931 if(j) //H8B= higher 8 bytes L8B lower 8 bytes
2932 {
2933 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B
2934 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B
2935 m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B
2936 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B
2937 }
2938 else
2939 {
2940 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B
2941 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B
2942 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B
2943 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B
2944 }
2945 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87
2946 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70
2947 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43
2948 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9
2949
2950
2951 /* o0[0-3] */
2952 {
2953 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2954 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2955 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2956 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2957
2958
2959 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2960 pi2_src_scratch += in_stride;
2961
2962 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57
2963 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43
2964 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90
2965 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25
2966
2967 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2968 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2969 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
2970 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2971 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2972
2973 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2974 m_count = _mm_cvtsi32_si128(i4_shift);
2975 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00);
2976
2977 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2978 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2979 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2980 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2981
2982 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2983
2984 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2985 pi2_dst_scratch += out_stride;
2986 }
2987
2988 /* o1[0-3] */
2989 {
2990 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
2991 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
2992 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
2993 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
2994
2995
2996 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2997 pi2_src_scratch += in_stride;
2998
2999 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9
3000 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87
3001 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57
3002 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43
3003
3004 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3005 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
3006 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
3007 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3008 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3009
3010 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3011 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3012 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3013 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3014
3015 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3016
3017 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3018 pi2_dst_scratch += ((!i) * out_stride + 8);
3019 }
3020
3021 /* o2[0-3] */
3022 {
3023 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3024 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3025 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3026 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3027
3028 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3029 pi2_src_scratch += in_stride;
3030
3031 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43
3032 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9
3033 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25
3034 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57
3035
3036 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3037 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3038 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
3039 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3040 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3041
3042 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3043 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3044 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3045 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3046
3047 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3048
3049 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3050 pi2_dst_scratch += out_stride;
3051 }
3052
3053 /* o3[0-3] */
3054 {
3055 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3056 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3057 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3058 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3059
3060
3061 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3062 pi2_src_scratch += 8;
3063
3064 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80
3065 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90
3066 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87
3067 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70
3068
3069 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25);
3070 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
3071 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
3072 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3073 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3074
3075 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3076 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3077 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3078 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3079
3080 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3081
3082 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3083 pi2_dst_scratch += (i * out_stride + 8);
3084 }
3085
3086 /* o4[0-3] */
3087 {
3088 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3089 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3090 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3091 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3092
3093 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3094 pi2_src_scratch -= in_stride;
3095
3096 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90
3097 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25
3098 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70
3099 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80
3100
3101 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21);
3102 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3103 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22);
3104 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3105 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3106
3107 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3108 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3109 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3110 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3111
3112 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3113
3114 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3115 pi2_dst_scratch += out_stride;
3116 }
3117
3118 /* o5[0-3] */
3119 {
3120 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3121 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3122 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3123 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3124
3125
3126 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3127 pi2_src_scratch -= in_stride;
3128
3129 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70
3130 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80
3131 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9
3132 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87
3133
3134 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3135 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27);
3136 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26);
3137 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3138 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3139
3140 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3141 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3142 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3143 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3144
3145 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3146
3147 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3148 pi2_dst_scratch += ((!i) * out_stride + 8);
3149 }
3150
3151 /* o6[0-3] */
3152 {
3153 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3154 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3155 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3156 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3157
3158
3159 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3160 pi2_src_scratch -= in_stride;
3161
3162 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25
3163 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57
3164 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80
3165 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90
3166
3167
3168 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3169 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23);
3170 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3171 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3172 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3173
3174 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3175 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3176 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3177 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3178
3179 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3180
3181 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3182 pi2_dst_scratch += out_stride;
3183 }
3184
3185 /* o7[0-3] */
3186 {
3187 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
3188 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6);
3189 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7);
3190 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8);
3191
3192 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3193 pi2_src_scratch += 8;
3194
3195 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25);
3196 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27);
3197 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26);
3198 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24);
3199 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24);
3200
3201
3202 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3203 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3204 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3205 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3206
3207 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3208
3209 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3210 pi2_dst_scratch += (i * out_stride + 8);
3211 }
3212
3213 }
3214 }
3215 }
3216 }
3217
3218 /* Transpose */
3219 {
3220 WORD16 *pi2_src_scratch;
3221 UWORD8 *pu1_pred_temp = pu1_pred;
3222 WORD32 out_stride = dst_strd;
3223 WORD32 in_stride = trans_size;
3224 WORD32 j;
3225 m_temp_reg_1 = _mm_setzero_si128();
3226 for(i = 0; i < 2; i++)
3227 {
3228 pi2_src_scratch = (i) ? (pi2_tmp + 8) : pi2_tmp;
3229
3230 for(j = 0; j < 2; j++)
3231 {
3232 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a
3233 pi2_src_scratch += in_stride;
3234 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c
3235 pi2_src_scratch += ((!i) * in_stride + 8);
3236 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e
3237 pi2_src_scratch += (in_stride);
3238 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g
3239 pi2_src_scratch += (i * in_stride + 8);
3240 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i
3241 pi2_src_scratch += in_stride;
3242 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k
3243 pi2_src_scratch += ((!i) * in_stride + 8);
3244 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m
3245 pi2_src_scratch += in_stride;
3246 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o
3247 pi2_src_scratch += (i * in_stride + 8);
3248
3249 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0
3250 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0
3251
3252 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0
3253 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0
3254
3255 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0
3256 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0
3257
3258 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0
3259 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0
3260
3261
3262 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0
3263 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2
3264
3265 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0
3266 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2
3267
3268 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0
3269 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2
3270
3271 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0
3272 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2
3273
3274
3275 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0
3276 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3277
3278 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3279 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3280
3281 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0
3282 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0);
3283 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12);
3284
3285 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
3286 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3287 pu1_dst += out_stride;
3288 pu1_pred_temp += pred_strd;
3289
3290 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1
3291 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3292
3293 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3294 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3295
3296 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1
3297 m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0);
3298 m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12);
3299
3300 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45);
3301 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3302 pu1_dst += out_stride;
3303 pu1_pred_temp += pred_strd;
3304
3305 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2
3306 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3307
3308 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3309 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3310
3311 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2
3312 m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0);
3313 m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12);
3314
3315 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46);
3316 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3317 pu1_dst += out_stride;
3318 pu1_pred_temp += pred_strd;
3319
3320 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3
3321 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp);
3322
3323 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1);
3324 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1);
3325
3326 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3
3327 m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0);
3328 m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12);
3329
3330 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47);
3331 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
3332 pu1_dst += out_stride;
3333 pu1_pred_temp += pred_strd;
3334 }
3335 }
3336 }
3337 }
3338