1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 ******************************************************************************
23 * @file ime_distortion_metrics_sse42.c
24 *
25 * @brief
26 *  This file contains definitions of routines that compute distortion
27 *  between two macro/sub blocks of identical dimensions
28 *
29 * @author
30 *  Ittiam
31 *
32 * @par List of Functions:
33 *  - ime_compute_sad_16x16_sse42()
34 *  - ime_compute_sad_16x16_fast_sse42()
35 *  - ime_compute_sad_16x16_ea8_sse42()
36 *  - ime_compute_sad_16x8_sse42()
37 *  - ime_calculate_sad4_prog_sse42()
38 *  - ime_sub_pel_compute_sad_16x16_sse42()
39 *  - ime_compute_satqd_16x16_lumainter_sse42()
40 *
41 * @remarks
42 *  None
43 *
44 *******************************************************************************
45 */
46 
47 /*****************************************************************************/
48 /* File Includes                                                             */
49 /*****************************************************************************/
50 
51 /* System include files */
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 
56 /* User include files */
57 #include "ime_typedefs.h"
58 #include "ime_defs.h"
59 #include "ime_macros.h"
60 #include "ime_statistics.h"
61 #include "ime_platform_macros.h"
62 #include "ime_distortion_metrics.h"
63 #include <immintrin.h>
64 
65 /*****************************************************************************/
66 /* Function Definitions                                                      */
67 /*****************************************************************************/
68 
69 /**
70 ******************************************************************************
71 *
72 * @brief computes distortion (SAD) between 2 16x16 blocks
73 *
74 * @par   Description
75 *   This functions computes SAD between 2 16x16 blocks. There is a provision
76 *   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
77 *   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
78 *
79 * @param[in] pu1_src
80 *  UWORD8 pointer to the source
81 *
82 * @param[out] pu1_dst
83 *  UWORD8 pointer to the destination
84 *
85 * @param[in] src_strd
86 *  integer source stride
87 *
88 * @param[in] dst_strd
89 *  integer destination stride
90 *
91 * @param[in] i4_max_sad
92 *  integer maximum allowed distortion
93 *
94 * @param[out] pi4_mb_distortion
95 *  integer evaluated sad
96 *
97 * @remarks
98 *
99 ******************************************************************************
100 */
ime_compute_sad_16x16_sse42(UWORD8 * pu1_src,UWORD8 * pu1_est,WORD32 src_strd,WORD32 est_strd,WORD32 i4_max_sad,WORD32 * pi4_mb_distortion)101 void ime_compute_sad_16x16_sse42(UWORD8 *pu1_src,
102                            UWORD8 *pu1_est,
103                            WORD32 src_strd,
104                            WORD32 est_strd,
105                            WORD32 i4_max_sad,
106                            WORD32 *pi4_mb_distortion)
107 {
108     __m128i src_r0, src_r1, src_r2, src_r3;
109     __m128i est_r0, est_r1, est_r2, est_r3;
110     __m128i res_r0, res_r1, res_r2, res_r3;
111     __m128i sad_val;
112     int val1, val2;
113     UNUSED (i4_max_sad);
114 
115     // Row 0-3 sad calculation
116     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
117     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
118     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
119     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
120 
121     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
122     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
123     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
124     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
125 
126     res_r0 = _mm_sad_epu8(src_r0, est_r0);
127     res_r1 = _mm_sad_epu8(src_r1, est_r1);
128     res_r2 = _mm_sad_epu8(src_r2, est_r2);
129     res_r3 = _mm_sad_epu8(src_r3, est_r3);
130 
131     sad_val = _mm_add_epi64(res_r0, res_r1);
132     sad_val = _mm_add_epi64(sad_val, res_r2);
133     sad_val = _mm_add_epi64(sad_val, res_r3);
134 
135     // Row 4-7 sad calculation
136     pu1_src += 4*src_strd;
137     pu1_est += 4*est_strd;
138 
139     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
140     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
141     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
142     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
143 
144     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
145     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
146     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
147     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
148 
149     res_r0 = _mm_sad_epu8(src_r0, est_r0);
150     res_r1 = _mm_sad_epu8(src_r1, est_r1);
151     res_r2 = _mm_sad_epu8(src_r2, est_r2);
152     res_r3 = _mm_sad_epu8(src_r3, est_r3);
153 
154     sad_val = _mm_add_epi64(sad_val, res_r0);
155     sad_val = _mm_add_epi64(sad_val, res_r1);
156     sad_val = _mm_add_epi64(sad_val, res_r2);
157     sad_val = _mm_add_epi64(sad_val, res_r3);
158 
159     // Row 8-11 sad calculation
160     pu1_src += 4*src_strd;
161     pu1_est += 4*est_strd;
162     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
163     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
164     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
165     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
166 
167     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
168     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
169     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
170     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
171 
172     res_r0 = _mm_sad_epu8(src_r0, est_r0);
173     res_r1 = _mm_sad_epu8(src_r1, est_r1);
174     res_r2 = _mm_sad_epu8(src_r2, est_r2);
175     res_r3 = _mm_sad_epu8(src_r3, est_r3);
176 
177     sad_val = _mm_add_epi64(sad_val, res_r0);
178     sad_val = _mm_add_epi64(sad_val, res_r1);
179     sad_val = _mm_add_epi64(sad_val, res_r2);
180     sad_val = _mm_add_epi64(sad_val, res_r3);
181 
182     // Row 12-15 sad calculation
183     pu1_src += 4*src_strd;
184     pu1_est += 4*est_strd;
185     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
186     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
187     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
188     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
189 
190     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
191     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
192     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
193     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
194 
195     res_r0 = _mm_sad_epu8(src_r0, est_r0);
196     res_r1 = _mm_sad_epu8(src_r1, est_r1);
197     res_r2 = _mm_sad_epu8(src_r2, est_r2);
198     res_r3 = _mm_sad_epu8(src_r3, est_r3);
199 
200     sad_val = _mm_add_epi64(sad_val, res_r0);
201     sad_val = _mm_add_epi64(sad_val, res_r1);
202     sad_val = _mm_add_epi64(sad_val, res_r2);
203     sad_val = _mm_add_epi64(sad_val, res_r3);
204 
205     val1 = _mm_extract_epi32(sad_val,0);
206     val2 = _mm_extract_epi32(sad_val, 2);
207     *pi4_mb_distortion = (val1+val2);
208 
209     return;
210 }
211 
212 /**
213 ******************************************************************************
214 *
215 *  @brief computes distortion (SAD) between 2 16x8  blocks
216 *
217 *
218 *  @par   Description
219 *   This functions computes SAD between 2 16x8 blocks. There is a provision
220 *   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
221 *   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
222 *
223 * @param[in] pu1_src
224 *  UWORD8 pointer to the source
225 *
226 * @param[out] pu1_dst
227 *  UWORD8 pointer to the destination
228 *
229 * @param[in] src_strd
230 *  integer source stride
231 *
232 * @param[in] dst_strd
233 *  integer destination stride
234 *
235 * @param[in] u4_max_sad
236 *  integer maximum allowed distortion
237 *
238 * @param[out] pi4_mb_distortion
239 *  integer evaluated sad
240 *
241 * @remarks
242 *
243 ******************************************************************************
244 */
ime_compute_sad_16x8_sse42(UWORD8 * pu1_src,UWORD8 * pu1_est,WORD32 src_strd,WORD32 est_strd,WORD32 i4_max_sad,WORD32 * pi4_mb_distortion)245 void ime_compute_sad_16x8_sse42(UWORD8 *pu1_src,
246                     UWORD8 *pu1_est,
247                     WORD32 src_strd,
248                     WORD32 est_strd,
249                     WORD32 i4_max_sad,
250                     WORD32 *pi4_mb_distortion)
251 {
252     __m128i src_r0, src_r1, src_r2, src_r3;
253     __m128i est_r0, est_r1, est_r2, est_r3;
254     __m128i res_r0, res_r1, res_r2, res_r3;
255     __m128i sad_val;
256     int val1, val2;
257     UNUSED (i4_max_sad);
258 
259     // Row 0-3 sad calculation
260     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
261     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
262     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
263     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
264 
265     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
266     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
267     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
268     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
269 
270     res_r0 = _mm_sad_epu8(src_r0, est_r0);
271     res_r1 = _mm_sad_epu8(src_r1, est_r1);
272     res_r2 = _mm_sad_epu8(src_r2, est_r2);
273     res_r3 = _mm_sad_epu8(src_r3, est_r3);
274 
275     sad_val = _mm_add_epi64(res_r0, res_r1);
276     sad_val = _mm_add_epi64(sad_val, res_r2);
277     sad_val = _mm_add_epi64(sad_val, res_r3);
278 
279     // Row 4-7 sad calculation
280     pu1_src += 4*src_strd;
281     pu1_est += 4*est_strd;
282 
283     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
284     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + src_strd));
285     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
286     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 3*src_strd));
287 
288     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
289     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + est_strd));
290     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
291     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 3*est_strd));
292 
293     res_r0 = _mm_sad_epu8(src_r0, est_r0);
294     res_r1 = _mm_sad_epu8(src_r1, est_r1);
295     res_r2 = _mm_sad_epu8(src_r2, est_r2);
296     res_r3 = _mm_sad_epu8(src_r3, est_r3);
297 
298     sad_val = _mm_add_epi64(sad_val, res_r0);
299     sad_val = _mm_add_epi64(sad_val, res_r1);
300     sad_val = _mm_add_epi64(sad_val, res_r2);
301     sad_val = _mm_add_epi64(sad_val, res_r3);
302 
303     val1 = _mm_extract_epi32(sad_val,0);
304     val2 = _mm_extract_epi32(sad_val, 2);
305     *pi4_mb_distortion = (val1+val2);
306     return;
307 }
308 
309 /**
310 ******************************************************************************
311 *
312 * @brief computes distortion (SAD) between 2 16x16 blocks
313 *
314 * @par   Description
315 *   This functions computes SAD between 2 16x16 blocks. There is a provision
316 *   for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
317 *   compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
318 *
319 * @param[in] pu1_src
320 *  UWORD8 pointer to the source
321 *
322 * @param[out] pu1_dst
323 *  UWORD8 pointer to the destination
324 *
325 * @param[in] src_strd
326 *  integer source stride
327 *
328 * @param[in] dst_strd
329 *  integer destination stride
330 *
331 * @param[in] i4_max_sad
332 *  integer maximum allowed distortion
333 *
334 * @param[out] pi4_mb_distortion
335 *  integer evaluated sad
336 *
337 * @remarks
338 *
339 ******************************************************************************
340 */
ime_compute_sad_16x16_ea8_sse42(UWORD8 * pu1_src,UWORD8 * pu1_est,WORD32 src_strd,WORD32 est_strd,WORD32 i4_max_sad,WORD32 * pi4_mb_distortion)341 void ime_compute_sad_16x16_ea8_sse42(UWORD8 *pu1_src,
342                                UWORD8 *pu1_est,
343                                WORD32 src_strd,
344                                WORD32 est_strd,
345                                WORD32 i4_max_sad,
346                                WORD32 *pi4_mb_distortion)
347 {
348     __m128i src_r0, src_r1, src_r2, src_r3;
349     __m128i est_r0, est_r1, est_r2, est_r3;
350     __m128i res_r0, res_r1, res_r2, res_r3;
351     __m128i sad_val;
352     WORD32 val1, val2;
353     WORD32 i4_sad;
354     UWORD8 *pu1_src_temp = pu1_src + src_strd;
355     UWORD8 *pu1_est_temp = pu1_est + est_strd;
356 
357     // Row 0,2,4,6 sad calculation
358     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
359     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
360     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
361     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
362 
363     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
364     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
365     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
366     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
367 
368     res_r0 = _mm_sad_epu8(src_r0, est_r0);
369     res_r1 = _mm_sad_epu8(src_r1, est_r1);
370     res_r2 = _mm_sad_epu8(src_r2, est_r2);
371     res_r3 = _mm_sad_epu8(src_r3, est_r3);
372 
373     sad_val = _mm_add_epi64(res_r0, res_r1);
374     sad_val = _mm_add_epi64(sad_val, res_r2);
375     sad_val = _mm_add_epi64(sad_val, res_r3);
376 
377     // Row 8,10,12,14 sad calculation
378     pu1_src += 8*src_strd;
379     pu1_est += 8*est_strd;
380 
381     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
382     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
383     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
384     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
385 
386     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
387     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
388     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
389     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
390 
391     res_r0 = _mm_sad_epu8(src_r0, est_r0);
392     res_r1 = _mm_sad_epu8(src_r1, est_r1);
393     res_r2 = _mm_sad_epu8(src_r2, est_r2);
394     res_r3 = _mm_sad_epu8(src_r3, est_r3);
395 
396     sad_val = _mm_add_epi64(sad_val, res_r0);
397     sad_val = _mm_add_epi64(sad_val, res_r1);
398     sad_val = _mm_add_epi64(sad_val, res_r2);
399     sad_val = _mm_add_epi64(sad_val, res_r3);
400 
401     pu1_src = pu1_src_temp;
402     pu1_est = pu1_est_temp;
403 
404     val1 = _mm_extract_epi32(sad_val, 0);
405     val2 = _mm_extract_epi32(sad_val, 2);
406 
407     i4_sad = val1 + val2;
408     if (i4_max_sad < i4_sad)
409     {
410         *pi4_mb_distortion = i4_sad;
411         return ;
412     }
413     // Row 1,3,5,7 sad calculation
414     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
415     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
416     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
417     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
418 
419     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
420     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
421     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
422     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
423 
424     res_r0 = _mm_sad_epu8(src_r0, est_r0);
425     res_r1 = _mm_sad_epu8(src_r1, est_r1);
426     res_r2 = _mm_sad_epu8(src_r2, est_r2);
427     res_r3 = _mm_sad_epu8(src_r3, est_r3);
428 
429     sad_val = _mm_add_epi64(sad_val, res_r0);
430     sad_val = _mm_add_epi64(sad_val, res_r1);
431     sad_val = _mm_add_epi64(sad_val, res_r2);
432     sad_val = _mm_add_epi64(sad_val, res_r3);
433 
434     // Row 9,11,13,15 sad calculation
435     pu1_src += 8*src_strd;
436     pu1_est += 8*est_strd;
437     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
438     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2*src_strd));
439     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4*src_strd));
440     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6*src_strd));
441 
442     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
443     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2*est_strd));
444     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4*est_strd));
445     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6*est_strd));
446 
447     res_r0 = _mm_sad_epu8(src_r0, est_r0);
448     res_r1 = _mm_sad_epu8(src_r1, est_r1);
449     res_r2 = _mm_sad_epu8(src_r2, est_r2);
450     res_r3 = _mm_sad_epu8(src_r3, est_r3);
451 
452     sad_val = _mm_add_epi64(sad_val, res_r0);
453     sad_val = _mm_add_epi64(sad_val, res_r1);
454     sad_val = _mm_add_epi64(sad_val, res_r2);
455     sad_val = _mm_add_epi64(sad_val, res_r3);
456 
457     val1 = _mm_extract_epi32(sad_val, 0);
458     val2 = _mm_extract_epi32(sad_val, 2);
459     *pi4_mb_distortion = (val1+val2);
460 
461     return;
462 }
463 
464 /**
465 ******************************************************************************
466 *
467 * @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
468 *
469 * @par   Description
470 *   This functions computes SAD between 2 16x16 blocks by processing alternate
471 *   rows (fast mode). For fast mode it is assumed sad obtained by processing
472 *   alternate rows is approximately twice as that for the whole block.
473 *
474 * @param[in] pu1_src
475 *  UWORD8 pointer to the source
476 *
477 * @param[out] pu1_dst
478 *  UWORD8 pointer to the destination
479 *
480 * @param[in] src_strd
481 *  integer source stride
482 *
483 * @param[in] dst_strd
484 *  integer destination stride
485 *
486 * @param[in] i4_max_sad
487 *  integer maximum allowed distortion
488 *
489 * @param[out] pi4_mb_distortion
490 *  integer evaluated sad
491 *
492 * @remarks
493 *
494 ******************************************************************************
495 */
ime_compute_sad_16x16_fast_sse42(UWORD8 * pu1_src,UWORD8 * pu1_est,WORD32 src_strd,WORD32 est_strd,WORD32 i4_max_sad,WORD32 * pi4_mb_distortion)496 void ime_compute_sad_16x16_fast_sse42(UWORD8 *pu1_src,
497                                 UWORD8 *pu1_est,
498                                 WORD32 src_strd,
499                                 WORD32 est_strd,
500                                 WORD32 i4_max_sad,
501                                 WORD32 *pi4_mb_distortion)
502 {
503     __m128i src_r0, src_r1, src_r2, src_r3;
504     __m128i est_r0, est_r1, est_r2, est_r3;
505     __m128i res_r0, res_r1, res_r2, res_r3;
506     __m128i sad_val;
507     WORD32 val1, val2;
508     WORD32 i4_sad;
509     UWORD8 *pu1_src_temp = pu1_src + src_strd;
510     UWORD8 *pu1_est_temp = pu1_est + est_strd;
511     UNUSED (i4_max_sad);
512 
513     // Row 0,2,4,6 sad calculation
514     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
515     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
516     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
517     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));
518 
519     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
520     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
521     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
522     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));
523 
524     res_r0 = _mm_sad_epu8(src_r0, est_r0);
525     res_r1 = _mm_sad_epu8(src_r1, est_r1);
526     res_r2 = _mm_sad_epu8(src_r2, est_r2);
527     res_r3 = _mm_sad_epu8(src_r3, est_r3);
528 
529     sad_val = _mm_add_epi64(res_r0, res_r1);
530     sad_val = _mm_add_epi64(sad_val, res_r2);
531     sad_val = _mm_add_epi64(sad_val, res_r3);
532 
533     // Row 8,10,12,14 sad calculation
534     pu1_src += 8 * src_strd;
535     pu1_est += 8 * est_strd;
536 
537     src_r0 = _mm_loadu_si128((__m128i *) (pu1_src));
538     src_r1 = _mm_loadu_si128((__m128i *) (pu1_src + 2 * src_strd));
539     src_r2 = _mm_loadu_si128((__m128i *) (pu1_src + 4 * src_strd));
540     src_r3 = _mm_loadu_si128((__m128i *) (pu1_src + 6 * src_strd));
541 
542     est_r0 = _mm_loadu_si128((__m128i *) (pu1_est));
543     est_r1 = _mm_loadu_si128((__m128i *) (pu1_est + 2 * est_strd));
544     est_r2 = _mm_loadu_si128((__m128i *) (pu1_est + 4 * est_strd));
545     est_r3 = _mm_loadu_si128((__m128i *) (pu1_est + 6 * est_strd));
546 
547     res_r0 = _mm_sad_epu8(src_r0, est_r0);
548     res_r1 = _mm_sad_epu8(src_r1, est_r1);
549     res_r2 = _mm_sad_epu8(src_r2, est_r2);
550     res_r3 = _mm_sad_epu8(src_r3, est_r3);
551 
552     sad_val = _mm_add_epi64(sad_val, res_r0);
553     sad_val = _mm_add_epi64(sad_val, res_r1);
554     sad_val = _mm_add_epi64(sad_val, res_r2);
555     sad_val = _mm_add_epi64(sad_val, res_r3);
556 
557     pu1_src = pu1_src_temp;
558     pu1_est = pu1_est_temp;
559 
560     val1 = _mm_extract_epi32(sad_val, 0);
561     val2 = _mm_extract_epi32(sad_val, 2);
562 
563     i4_sad = val1 + val2;
564     *pi4_mb_distortion = (i4_sad<<1);
565     return;
566 }
567 
568 /**
569 *******************************************************************************
570 *
571 * @brief compute sad
572 *
573 * @par Description: This function computes the sad at vertices of diamond grid
574 * centered at reference pointer and at unit distance from it.
575 *
576 * @param[in] pu1_ref
577 *  UWORD8 pointer to the reference
578 *
579 * @param[out] pu1_src
580 *  UWORD8 pointer to the source
581 *
582 * @param[in] ref_strd
583 *  integer reference stride
584 *
585 * @param[in] src_strd
586 *  integer source stride
587 *
588 * @param[out] pi4_sad
589 *  pointer to integer array evaluated sad
590 *
591 * @returns  sad at all evaluated vertexes
592 *
593 * @remarks  none
594 *
595 *******************************************************************************
596 */
ime_calculate_sad4_prog_sse42(UWORD8 * pu1_ref,UWORD8 * pu1_src,WORD32 ref_strd,WORD32 src_strd,WORD32 * pi4_sad)597 void ime_calculate_sad4_prog_sse42(UWORD8 *pu1_ref,
598                              UWORD8 *pu1_src,
599                              WORD32 ref_strd,
600                              WORD32 src_strd,
601                              WORD32 *pi4_sad)
602 {
603     /* reference ptrs at unit 1 distance in diamond pattern centered at pu1_ref */
604     UWORD8 *left_ptr    = pu1_ref - 1;
605     UWORD8 *right_ptr   = pu1_ref + 1;
606     UWORD8 *top_ptr     = pu1_ref - ref_strd;
607     UWORD8 *bot_ptr     = pu1_ref + ref_strd;
608 
609     WORD32 val1, val2;
610     __m128i src, ref_left, ref_right, ref_top, ref_bot;
611     __m128i res_r0, res_r1, res_r2, res_r3;
612     __m128i sad_r0, sad_r1, sad_r2, sad_r3;
613 
614     // Row 0 sad calculation
615     src = _mm_loadu_si128((__m128i *) (pu1_src));
616     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
617     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
618     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
619     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
620 
621     sad_r0 = _mm_sad_epu8(src, ref_left);
622     sad_r1 = _mm_sad_epu8(src, ref_right);
623     sad_r2 = _mm_sad_epu8(src, ref_top);
624     sad_r3 = _mm_sad_epu8(src, ref_bot);
625 
626     pu1_src += src_strd;
627     left_ptr += ref_strd;
628     right_ptr += ref_strd;
629     top_ptr += ref_strd;
630     bot_ptr += ref_strd;
631 
632     // Row 1 sad calculation
633     src = _mm_loadu_si128((__m128i *) (pu1_src));
634     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
635     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
636     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
637     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
638 
639     res_r0 = _mm_sad_epu8(src, ref_left);
640     res_r1 = _mm_sad_epu8(src, ref_right);
641     res_r2 = _mm_sad_epu8(src, ref_top);
642     res_r3 = _mm_sad_epu8(src, ref_bot);
643 
644     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
645     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
646     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
647     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
648 
649     pu1_src += src_strd;
650     left_ptr += ref_strd;
651     right_ptr += ref_strd;
652     top_ptr += ref_strd;
653     bot_ptr += ref_strd;
654 
655     // Row 2 sad calculation
656     src = _mm_loadu_si128((__m128i *) (pu1_src));
657     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
658     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
659     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
660     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
661 
662     res_r0 = _mm_sad_epu8(src, ref_left);
663     res_r1 = _mm_sad_epu8(src, ref_right);
664     res_r2 = _mm_sad_epu8(src, ref_top);
665     res_r3 = _mm_sad_epu8(src, ref_bot);
666 
667     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
668     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
669     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
670     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
671 
672     pu1_src += src_strd;
673     left_ptr += ref_strd;
674     right_ptr += ref_strd;
675     top_ptr += ref_strd;
676     bot_ptr += ref_strd;
677 
678     // Row 3 sad calculation
679     src = _mm_loadu_si128((__m128i *) (pu1_src));
680     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
681     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
682     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
683     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
684 
685     res_r0 = _mm_sad_epu8(src, ref_left);
686     res_r1 = _mm_sad_epu8(src, ref_right);
687     res_r2 = _mm_sad_epu8(src, ref_top);
688     res_r3 = _mm_sad_epu8(src, ref_bot);
689 
690     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
691     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
692     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
693     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
694 
695     pu1_src += src_strd;
696     left_ptr += ref_strd;
697     right_ptr += ref_strd;
698     top_ptr += ref_strd;
699     bot_ptr += ref_strd;
700 
701     // Row 4 sad calculation
702     src = _mm_loadu_si128((__m128i *) (pu1_src));
703     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
704     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
705     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
706     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
707 
708     res_r0 = _mm_sad_epu8(src, ref_left);
709     res_r1 = _mm_sad_epu8(src, ref_right);
710     res_r2 = _mm_sad_epu8(src, ref_top);
711     res_r3 = _mm_sad_epu8(src, ref_bot);
712 
713     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
714     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
715     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
716     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
717 
718     pu1_src += src_strd;
719     left_ptr += ref_strd;
720     right_ptr += ref_strd;
721     top_ptr += ref_strd;
722     bot_ptr += ref_strd;
723 
724     // Row 5 sad calculation
725     src = _mm_loadu_si128((__m128i *) (pu1_src));
726     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
727     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
728     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
729     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
730 
731     res_r0 = _mm_sad_epu8(src, ref_left);
732     res_r1 = _mm_sad_epu8(src, ref_right);
733     res_r2 = _mm_sad_epu8(src, ref_top);
734     res_r3 = _mm_sad_epu8(src, ref_bot);
735 
736     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
737     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
738     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
739     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
740 
741     pu1_src += src_strd;
742     left_ptr += ref_strd;
743     right_ptr += ref_strd;
744     top_ptr += ref_strd;
745     bot_ptr += ref_strd;
746 
747     // Row 6 sad calculation
748     src = _mm_loadu_si128((__m128i *) (pu1_src));
749     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
750     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
751     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
752     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
753 
754     res_r0 = _mm_sad_epu8(src, ref_left);
755     res_r1 = _mm_sad_epu8(src, ref_right);
756     res_r2 = _mm_sad_epu8(src, ref_top);
757     res_r3 = _mm_sad_epu8(src, ref_bot);
758 
759     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
760     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
761     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
762     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
763 
764     pu1_src += src_strd;
765     left_ptr += ref_strd;
766     right_ptr += ref_strd;
767     top_ptr += ref_strd;
768     bot_ptr += ref_strd;
769 
770     // Row 7 sad calculation
771     src = _mm_loadu_si128((__m128i *) (pu1_src));
772     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
773     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
774     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
775     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
776 
777     res_r0 = _mm_sad_epu8(src, ref_left);
778     res_r1 = _mm_sad_epu8(src, ref_right);
779     res_r2 = _mm_sad_epu8(src, ref_top);
780     res_r3 = _mm_sad_epu8(src, ref_bot);
781 
782     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
783     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
784     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
785     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
786 
787     pu1_src += src_strd;
788     left_ptr += ref_strd;
789     right_ptr += ref_strd;
790     top_ptr += ref_strd;
791     bot_ptr += ref_strd;
792 
793     // Row 8 sad calculation
794     src = _mm_loadu_si128((__m128i *) (pu1_src));
795     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
796     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
797     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
798     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
799 
800     res_r0 = _mm_sad_epu8(src, ref_left);
801     res_r1 = _mm_sad_epu8(src, ref_right);
802     res_r2 = _mm_sad_epu8(src, ref_top);
803     res_r3 = _mm_sad_epu8(src, ref_bot);
804 
805     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
806     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
807     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
808     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
809 
810     pu1_src += src_strd;
811     left_ptr += ref_strd;
812     right_ptr += ref_strd;
813     top_ptr += ref_strd;
814     bot_ptr += ref_strd;
815 
816     // Row 9 sad calculation
817     src = _mm_loadu_si128((__m128i *) (pu1_src));
818     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
819     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
820     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
821     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
822 
823     res_r0 = _mm_sad_epu8(src, ref_left);
824     res_r1 = _mm_sad_epu8(src, ref_right);
825     res_r2 = _mm_sad_epu8(src, ref_top);
826     res_r3 = _mm_sad_epu8(src, ref_bot);
827 
828     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
829     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
830     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
831     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
832 
833     pu1_src += src_strd;
834     left_ptr += ref_strd;
835     right_ptr += ref_strd;
836     top_ptr += ref_strd;
837     bot_ptr += ref_strd;
838 
839     // Row 10 sad calculation
840     src = _mm_loadu_si128((__m128i *) (pu1_src));
841     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
842     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
843     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
844     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
845 
846     res_r0 = _mm_sad_epu8(src, ref_left);
847     res_r1 = _mm_sad_epu8(src, ref_right);
848     res_r2 = _mm_sad_epu8(src, ref_top);
849     res_r3 = _mm_sad_epu8(src, ref_bot);
850 
851     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
852     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
853     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
854     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
855 
856     pu1_src += src_strd;
857     left_ptr += ref_strd;
858     right_ptr += ref_strd;
859     top_ptr += ref_strd;
860     bot_ptr += ref_strd;
861 
862     // Row 11 sad calculation
863     src = _mm_loadu_si128((__m128i *) (pu1_src));
864     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
865     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
866     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
867     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
868 
869     res_r0 = _mm_sad_epu8(src, ref_left);
870     res_r1 = _mm_sad_epu8(src, ref_right);
871     res_r2 = _mm_sad_epu8(src, ref_top);
872     res_r3 = _mm_sad_epu8(src, ref_bot);
873 
874     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
875     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
876     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
877     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
878 
879     pu1_src += src_strd;
880     left_ptr += ref_strd;
881     right_ptr += ref_strd;
882     top_ptr += ref_strd;
883     bot_ptr += ref_strd;
884 
885     // Row 12 sad calculation
886     src = _mm_loadu_si128((__m128i *) (pu1_src));
887     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
888     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
889     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
890     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
891 
892     res_r0 = _mm_sad_epu8(src, ref_left);
893     res_r1 = _mm_sad_epu8(src, ref_right);
894     res_r2 = _mm_sad_epu8(src, ref_top);
895     res_r3 = _mm_sad_epu8(src, ref_bot);
896 
897     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
898     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
899     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
900     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
901 
902     pu1_src += src_strd;
903     left_ptr += ref_strd;
904     right_ptr += ref_strd;
905     top_ptr += ref_strd;
906     bot_ptr += ref_strd;
907 
908     // Row 13 sad calculation
909     src = _mm_loadu_si128((__m128i *) (pu1_src));
910     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
911     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
912     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
913     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
914 
915     res_r0 = _mm_sad_epu8(src, ref_left);
916     res_r1 = _mm_sad_epu8(src, ref_right);
917     res_r2 = _mm_sad_epu8(src, ref_top);
918     res_r3 = _mm_sad_epu8(src, ref_bot);
919 
920     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
921     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
922     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
923     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
924 
925     pu1_src += src_strd;
926     left_ptr += ref_strd;
927     right_ptr += ref_strd;
928     top_ptr += ref_strd;
929     bot_ptr += ref_strd;
930 
931     // Row 14 sad calculation
932     src = _mm_loadu_si128((__m128i *) (pu1_src));
933     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
934     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
935     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
936     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
937 
938     res_r0 = _mm_sad_epu8(src, ref_left);
939     res_r1 = _mm_sad_epu8(src, ref_right);
940     res_r2 = _mm_sad_epu8(src, ref_top);
941     res_r3 = _mm_sad_epu8(src, ref_bot);
942 
943     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
944     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
945     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
946     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
947 
948     pu1_src += src_strd;
949     left_ptr += ref_strd;
950     right_ptr += ref_strd;
951     top_ptr += ref_strd;
952     bot_ptr += ref_strd;
953 
954     // Row 15 sad calculation
955     src = _mm_loadu_si128((__m128i *) (pu1_src));
956     ref_left = _mm_loadu_si128((__m128i *) (left_ptr));
957     ref_right = _mm_loadu_si128((__m128i *) (right_ptr));
958     ref_top = _mm_loadu_si128((__m128i *) (top_ptr));
959     ref_bot = _mm_loadu_si128((__m128i *) (bot_ptr));
960 
961     res_r0 = _mm_sad_epu8(src, ref_left);
962     res_r1 = _mm_sad_epu8(src, ref_right);
963     res_r2 = _mm_sad_epu8(src, ref_top);
964     res_r3 = _mm_sad_epu8(src, ref_bot);
965 
966     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
967     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
968     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
969     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
970 
971     val1 = _mm_extract_epi32(sad_r0, 0);
972     val2 = _mm_extract_epi32(sad_r0, 2);
973     pi4_sad[0] = (val1 + val2);
974 
975     val1 = _mm_extract_epi32(sad_r1, 0);
976     val2 = _mm_extract_epi32(sad_r1, 2);
977     pi4_sad[1] = (val1 + val2);
978 
979     val1 = _mm_extract_epi32(sad_r2, 0);
980     val2 = _mm_extract_epi32(sad_r2, 2);
981     pi4_sad[2] = (val1 + val2);
982 
983     val1 = _mm_extract_epi32(sad_r3, 0);
984     val2 = _mm_extract_epi32(sad_r3, 2);
985     pi4_sad[3] = (val1 + val2);
986 }
987 
988 /**
989 ******************************************************************************
990 *
991 * @brief computes distortion (SAD) at all subpel points about the src location
992 *
993 * @par Description
994 *   This function computes SAD at all points at a subpel distance from the
995 *   current source location.
996 *
997 * @param[in] pu1_src
998 *  UWORD8 pointer to the source
999 *
1000 * @param[out] pu1_ref_half_x
1001 *  UWORD8 pointer to half pel buffer
1002 *
1003 * @param[out] pu1_ref_half_y
1004 *  UWORD8 pointer to half pel buffer
1005 *
1006 * @param[out] pu1_ref_half_xy
1007 *  UWORD8 pointer to half pel buffer
1008 *
1009 * @param[in] src_strd
1010 *  integer source stride
1011 *
1012 * @param[in] ref_strd
1013 *  integer ref stride
1014 *
1015 * @param[out] pi4_sad
1016 *  integer evaluated sad
1017 *  pi4_sad[0] - half x
1018 *  pi4_sad[1] - half x - 1
1019 *  pi4_sad[2] - half y
1020 *  pi4_sad[3] - half y - 1
1021 *  pi4_sad[4] - half xy
1022 *  pi4_sad[5] - half xy - 1
1023 *  pi4_sad[6] - half xy - strd
1024 *  pi4_sad[7] - half xy - 1 - strd
1025 *
1026 * @remarks
1027 *
1028 ******************************************************************************
1029 */
ime_sub_pel_compute_sad_16x16_sse42(UWORD8 * pu1_src,UWORD8 * pu1_ref_half_x,UWORD8 * pu1_ref_half_y,UWORD8 * pu1_ref_half_xy,WORD32 src_strd,WORD32 ref_strd,WORD32 * pi4_sad)1030 void ime_sub_pel_compute_sad_16x16_sse42(UWORD8 *pu1_src,
1031                                    UWORD8 *pu1_ref_half_x,
1032                                    UWORD8 *pu1_ref_half_y,
1033                                    UWORD8 *pu1_ref_half_xy,
1034                                    WORD32 src_strd,
1035                                    WORD32 ref_strd,
1036                                    WORD32 *pi4_sad)
1037 {
1038     UWORD8 *pu1_ref_half_x_left = pu1_ref_half_x - 1;
1039     UWORD8 *pu1_ref_half_y_top = pu1_ref_half_y - ref_strd;
1040     UWORD8 *pu1_ref_half_xy_left = pu1_ref_half_xy - 1;
1041     UWORD8 *pu1_ref_half_xy_top = pu1_ref_half_xy - ref_strd;
1042     UWORD8 *pu1_ref_half_xy_top_left = pu1_ref_half_xy - ref_strd - 1;
1043     WORD32 val1, val2;
1044 
1045     __m128i src, ref_half_x, ref_half_y, ref_half_xy;
1046     __m128i ref_half_x_left, ref_half_y_top, ref_half_xy_left, ref_half_xy_top, ref_half_xy_top_left;
1047     __m128i res_r0, res_r1, res_r2, res_r3, res_r4, res_r5, res_r6, res_r7;
1048     __m128i sad_r0, sad_r1, sad_r2, sad_r3, sad_r4, sad_r5, sad_r6, sad_r7;
1049     // Row 0 sad calculation
1050     src = _mm_loadu_si128((__m128i *) (pu1_src));
1051     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1052     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1053     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1054     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1055     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1056     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1057     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1058     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1059 
1060     sad_r0 = _mm_sad_epu8(src, ref_half_x);
1061     sad_r1 = _mm_sad_epu8(src, ref_half_x_left);
1062     sad_r2 = _mm_sad_epu8(src, ref_half_y);
1063     sad_r3 = _mm_sad_epu8(src, ref_half_y_top);
1064     sad_r4 = _mm_sad_epu8(src, ref_half_xy);
1065     sad_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1066     sad_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1067     sad_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1068 
1069     pu1_src += src_strd;
1070     pu1_ref_half_x += ref_strd;
1071     pu1_ref_half_x_left += ref_strd;
1072     pu1_ref_half_y += ref_strd;
1073     pu1_ref_half_y_top += ref_strd;
1074     pu1_ref_half_xy += ref_strd;
1075     pu1_ref_half_xy_left += ref_strd;
1076     pu1_ref_half_xy_top += ref_strd;
1077     pu1_ref_half_xy_top_left += ref_strd;
1078 
1079     // Row 1 sad calculation
1080     src = _mm_loadu_si128((__m128i *) (pu1_src));
1081     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1082     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1083     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1084     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1085     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1086     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1087     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1088     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1089 
1090     res_r0 = _mm_sad_epu8(src, ref_half_x);
1091     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1092     res_r2 = _mm_sad_epu8(src, ref_half_y);
1093     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1094     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1095     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1096     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1097     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1098 
1099     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1100     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1101     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1102     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1103     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1104     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1105     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1106     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1107 
1108     pu1_src += src_strd;
1109     pu1_ref_half_x += ref_strd;
1110     pu1_ref_half_x_left += ref_strd;
1111     pu1_ref_half_y += ref_strd;
1112     pu1_ref_half_y_top += ref_strd;
1113     pu1_ref_half_xy += ref_strd;
1114     pu1_ref_half_xy_left += ref_strd;
1115     pu1_ref_half_xy_top += ref_strd;
1116     pu1_ref_half_xy_top_left += ref_strd;
1117 
1118     // Row 2 sad calculation
1119     src = _mm_loadu_si128((__m128i *) (pu1_src));
1120     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1121     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1122     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1123     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1124     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1125     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1126     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1127     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1128 
1129     res_r0 = _mm_sad_epu8(src, ref_half_x);
1130     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1131     res_r2 = _mm_sad_epu8(src, ref_half_y);
1132     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1133     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1134     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1135     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1136     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1137 
1138     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1139     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1140     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1141     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1142     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1143     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1144     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1145     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1146 
1147     pu1_src += src_strd;
1148     pu1_ref_half_x += ref_strd;
1149     pu1_ref_half_x_left += ref_strd;
1150     pu1_ref_half_y += ref_strd;
1151     pu1_ref_half_y_top += ref_strd;
1152     pu1_ref_half_xy += ref_strd;
1153     pu1_ref_half_xy_left += ref_strd;
1154     pu1_ref_half_xy_top += ref_strd;
1155     pu1_ref_half_xy_top_left += ref_strd;
1156 
1157     // Row 3 sad calculation
1158     src = _mm_loadu_si128((__m128i *) (pu1_src));
1159     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1160     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1161     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1162     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1163     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1164     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1165     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1166     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1167 
1168     res_r0 = _mm_sad_epu8(src, ref_half_x);
1169     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1170     res_r2 = _mm_sad_epu8(src, ref_half_y);
1171     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1172     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1173     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1174     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1175     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1176 
1177     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1178     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1179     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1180     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1181     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1182     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1183     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1184     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1185 
1186     pu1_src += src_strd;
1187     pu1_ref_half_x += ref_strd;
1188     pu1_ref_half_x_left += ref_strd;
1189     pu1_ref_half_y += ref_strd;
1190     pu1_ref_half_y_top += ref_strd;
1191     pu1_ref_half_xy += ref_strd;
1192     pu1_ref_half_xy_left += ref_strd;
1193     pu1_ref_half_xy_top += ref_strd;
1194     pu1_ref_half_xy_top_left += ref_strd;
1195 
1196     // Row 4 sad calculation
1197     src = _mm_loadu_si128((__m128i *) (pu1_src));
1198     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1199     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1200     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1201     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1202     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1203     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1204     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1205     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1206 
1207     res_r0 = _mm_sad_epu8(src, ref_half_x);
1208     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1209     res_r2 = _mm_sad_epu8(src, ref_half_y);
1210     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1211     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1212     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1213     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1214     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1215 
1216     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1217     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1218     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1219     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1220     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1221     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1222     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1223     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1224 
1225     pu1_src += src_strd;
1226     pu1_ref_half_x += ref_strd;
1227     pu1_ref_half_x_left += ref_strd;
1228     pu1_ref_half_y += ref_strd;
1229     pu1_ref_half_y_top += ref_strd;
1230     pu1_ref_half_xy += ref_strd;
1231     pu1_ref_half_xy_left += ref_strd;
1232     pu1_ref_half_xy_top += ref_strd;
1233     pu1_ref_half_xy_top_left += ref_strd;
1234 
1235 
1236     // Row 5 sad calculation
1237     src = _mm_loadu_si128((__m128i *) (pu1_src));
1238     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1239     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1240     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1241     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1242     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1243     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1244     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1245     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1246 
1247     res_r0 = _mm_sad_epu8(src, ref_half_x);
1248     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1249     res_r2 = _mm_sad_epu8(src, ref_half_y);
1250     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1251     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1252     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1253     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1254     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1255 
1256     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1257     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1258     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1259     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1260     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1261     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1262     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1263     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1264 
1265     pu1_src += src_strd;
1266     pu1_ref_half_x += ref_strd;
1267     pu1_ref_half_x_left += ref_strd;
1268     pu1_ref_half_y += ref_strd;
1269     pu1_ref_half_y_top += ref_strd;
1270     pu1_ref_half_xy += ref_strd;
1271     pu1_ref_half_xy_left += ref_strd;
1272     pu1_ref_half_xy_top += ref_strd;
1273     pu1_ref_half_xy_top_left += ref_strd;
1274 
1275     // Row 6 sad calculation
1276     src = _mm_loadu_si128((__m128i *) (pu1_src));
1277     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1278     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1279     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1280     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1281     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1282     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1283     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1284     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1285 
1286     res_r0 = _mm_sad_epu8(src, ref_half_x);
1287     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1288     res_r2 = _mm_sad_epu8(src, ref_half_y);
1289     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1290     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1291     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1292     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1293     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1294 
1295     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1296     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1297     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1298     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1299     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1300     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1301     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1302     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1303 
1304     pu1_src += src_strd;
1305     pu1_ref_half_x += ref_strd;
1306     pu1_ref_half_x_left += ref_strd;
1307     pu1_ref_half_y += ref_strd;
1308     pu1_ref_half_y_top += ref_strd;
1309     pu1_ref_half_xy += ref_strd;
1310     pu1_ref_half_xy_left += ref_strd;
1311     pu1_ref_half_xy_top += ref_strd;
1312     pu1_ref_half_xy_top_left += ref_strd;
1313 
1314     // Row 7 sad calculation
1315     src = _mm_loadu_si128((__m128i *) (pu1_src));
1316     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1317     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1318     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1319     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1320     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1321     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1322     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1323     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1324 
1325     res_r0 = _mm_sad_epu8(src, ref_half_x);
1326     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1327     res_r2 = _mm_sad_epu8(src, ref_half_y);
1328     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1329     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1330     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1331     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1332     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1333 
1334     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1335     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1336     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1337     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1338     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1339     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1340     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1341     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1342 
1343     pu1_src += src_strd;
1344     pu1_ref_half_x += ref_strd;
1345     pu1_ref_half_x_left += ref_strd;
1346     pu1_ref_half_y += ref_strd;
1347     pu1_ref_half_y_top += ref_strd;
1348     pu1_ref_half_xy += ref_strd;
1349     pu1_ref_half_xy_left += ref_strd;
1350     pu1_ref_half_xy_top += ref_strd;
1351     pu1_ref_half_xy_top_left += ref_strd;
1352 
1353     // Row 8 sad calculation
1354     src = _mm_loadu_si128((__m128i *) (pu1_src));
1355     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1356     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1357     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1358     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1359     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1360     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1361     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1362     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1363 
1364     res_r0 = _mm_sad_epu8(src, ref_half_x);
1365     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1366     res_r2 = _mm_sad_epu8(src, ref_half_y);
1367     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1368     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1369     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1370     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1371     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1372 
1373     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1374     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1375     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1376     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1377     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1378     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1379     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1380     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1381 
1382     pu1_src += src_strd;
1383     pu1_ref_half_x += ref_strd;
1384     pu1_ref_half_x_left += ref_strd;
1385     pu1_ref_half_y += ref_strd;
1386     pu1_ref_half_y_top += ref_strd;
1387     pu1_ref_half_xy += ref_strd;
1388     pu1_ref_half_xy_left += ref_strd;
1389     pu1_ref_half_xy_top += ref_strd;
1390     pu1_ref_half_xy_top_left += ref_strd;
1391 
1392     // Row 9 sad calculation
1393     src = _mm_loadu_si128((__m128i *) (pu1_src));
1394     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1395     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1396     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1397     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1398     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1399     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1400     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1401     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1402 
1403     res_r0 = _mm_sad_epu8(src, ref_half_x);
1404     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1405     res_r2 = _mm_sad_epu8(src, ref_half_y);
1406     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1407     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1408     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1409     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1410     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1411 
1412     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1413     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1414     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1415     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1416     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1417     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1418     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1419     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1420 
1421     pu1_src += src_strd;
1422     pu1_ref_half_x += ref_strd;
1423     pu1_ref_half_x_left += ref_strd;
1424     pu1_ref_half_y += ref_strd;
1425     pu1_ref_half_y_top += ref_strd;
1426     pu1_ref_half_xy += ref_strd;
1427     pu1_ref_half_xy_left += ref_strd;
1428     pu1_ref_half_xy_top += ref_strd;
1429     pu1_ref_half_xy_top_left += ref_strd;
1430 
1431     // Row 10 sad calculation
1432     src = _mm_loadu_si128((__m128i *) (pu1_src));
1433     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1434     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1435     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1436     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1437     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1438     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1439     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1440     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1441 
1442     res_r0 = _mm_sad_epu8(src, ref_half_x);
1443     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1444     res_r2 = _mm_sad_epu8(src, ref_half_y);
1445     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1446     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1447     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1448     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1449     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1450 
1451     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1452     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1453     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1454     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1455     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1456     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1457     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1458     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1459 
1460     pu1_src += src_strd;
1461     pu1_ref_half_x += ref_strd;
1462     pu1_ref_half_x_left += ref_strd;
1463     pu1_ref_half_y += ref_strd;
1464     pu1_ref_half_y_top += ref_strd;
1465     pu1_ref_half_xy += ref_strd;
1466     pu1_ref_half_xy_left += ref_strd;
1467     pu1_ref_half_xy_top += ref_strd;
1468     pu1_ref_half_xy_top_left += ref_strd;
1469 
1470     // Row 11 sad calculation
1471     src = _mm_loadu_si128((__m128i *) (pu1_src));
1472     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1473     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1474     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1475     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1476     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1477     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1478     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1479     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1480 
1481     res_r0 = _mm_sad_epu8(src, ref_half_x);
1482     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1483     res_r2 = _mm_sad_epu8(src, ref_half_y);
1484     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1485     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1486     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1487     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1488     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1489 
1490     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1491     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1492     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1493     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1494     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1495     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1496     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1497     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1498 
1499     pu1_src += src_strd;
1500     pu1_ref_half_x += ref_strd;
1501     pu1_ref_half_x_left += ref_strd;
1502     pu1_ref_half_y += ref_strd;
1503     pu1_ref_half_y_top += ref_strd;
1504     pu1_ref_half_xy += ref_strd;
1505     pu1_ref_half_xy_left += ref_strd;
1506     pu1_ref_half_xy_top += ref_strd;
1507     pu1_ref_half_xy_top_left += ref_strd;
1508 
1509     // Row 12 sad calculation
1510     src = _mm_loadu_si128((__m128i *) (pu1_src));
1511     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1512     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1513     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1514     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1515     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1516     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1517     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1518     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1519 
1520     res_r0 = _mm_sad_epu8(src, ref_half_x);
1521     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1522     res_r2 = _mm_sad_epu8(src, ref_half_y);
1523     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1524     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1525     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1526     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1527     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1528 
1529     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1530     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1531     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1532     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1533     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1534     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1535     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1536     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1537 
1538     pu1_src += src_strd;
1539     pu1_ref_half_x += ref_strd;
1540     pu1_ref_half_x_left += ref_strd;
1541     pu1_ref_half_y += ref_strd;
1542     pu1_ref_half_y_top += ref_strd;
1543     pu1_ref_half_xy += ref_strd;
1544     pu1_ref_half_xy_left += ref_strd;
1545     pu1_ref_half_xy_top += ref_strd;
1546     pu1_ref_half_xy_top_left += ref_strd;
1547 
1548     // Row 13 sad calculation
1549     src = _mm_loadu_si128((__m128i *) (pu1_src));
1550     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1551     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1552     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1553     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1554     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1555     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1556     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1557     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1558 
1559     res_r0 = _mm_sad_epu8(src, ref_half_x);
1560     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1561     res_r2 = _mm_sad_epu8(src, ref_half_y);
1562     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1563     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1564     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1565     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1566     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1567 
1568     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1569     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1570     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1571     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1572     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1573     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1574     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1575     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1576 
1577     pu1_src += src_strd;
1578     pu1_ref_half_x += ref_strd;
1579     pu1_ref_half_x_left += ref_strd;
1580     pu1_ref_half_y += ref_strd;
1581     pu1_ref_half_y_top += ref_strd;
1582     pu1_ref_half_xy += ref_strd;
1583     pu1_ref_half_xy_left += ref_strd;
1584     pu1_ref_half_xy_top += ref_strd;
1585     pu1_ref_half_xy_top_left += ref_strd;
1586 
1587     // Row 14 sad calculation
1588     src = _mm_loadu_si128((__m128i *) (pu1_src));
1589     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1590     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1591     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1592     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1593     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1594     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1595     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1596     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1597 
1598     res_r0 = _mm_sad_epu8(src, ref_half_x);
1599     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1600     res_r2 = _mm_sad_epu8(src, ref_half_y);
1601     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1602     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1603     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1604     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1605     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1606 
1607     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1608     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1609     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1610     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1611     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1612     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1613     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1614     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1615 
1616     pu1_src += src_strd;
1617     pu1_ref_half_x += ref_strd;
1618     pu1_ref_half_x_left += ref_strd;
1619     pu1_ref_half_y += ref_strd;
1620     pu1_ref_half_y_top += ref_strd;
1621     pu1_ref_half_xy += ref_strd;
1622     pu1_ref_half_xy_left += ref_strd;
1623     pu1_ref_half_xy_top += ref_strd;
1624     pu1_ref_half_xy_top_left += ref_strd;
1625 
1626     // Row 15 sad calculation
1627     src = _mm_loadu_si128((__m128i *) (pu1_src));
1628     ref_half_x = _mm_loadu_si128((__m128i *) (pu1_ref_half_x));
1629     ref_half_y = _mm_loadu_si128((__m128i *) (pu1_ref_half_y));
1630     ref_half_xy = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy));
1631     ref_half_x_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_x_left));
1632     ref_half_y_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_y_top));
1633     ref_half_xy_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_left));
1634     ref_half_xy_top = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top));
1635     ref_half_xy_top_left = _mm_loadu_si128((__m128i *) (pu1_ref_half_xy_top_left));
1636 
1637     res_r0 = _mm_sad_epu8(src, ref_half_x);
1638     res_r1 = _mm_sad_epu8(src, ref_half_x_left);
1639     res_r2 = _mm_sad_epu8(src, ref_half_y);
1640     res_r3 = _mm_sad_epu8(src, ref_half_y_top);
1641     res_r4 = _mm_sad_epu8(src, ref_half_xy);
1642     res_r5 = _mm_sad_epu8(src, ref_half_xy_left);
1643     res_r6 = _mm_sad_epu8(src, ref_half_xy_top);
1644     res_r7 = _mm_sad_epu8(src, ref_half_xy_top_left);
1645 
1646     sad_r0 = _mm_add_epi64(sad_r0, res_r0);
1647     sad_r1 = _mm_add_epi64(sad_r1, res_r1);
1648     sad_r2 = _mm_add_epi64(sad_r2, res_r2);
1649     sad_r3 = _mm_add_epi64(sad_r3, res_r3);
1650     sad_r4 = _mm_add_epi64(sad_r4, res_r4);
1651     sad_r5 = _mm_add_epi64(sad_r5, res_r5);
1652     sad_r6 = _mm_add_epi64(sad_r6, res_r6);
1653     sad_r7 = _mm_add_epi64(sad_r7, res_r7);
1654 
1655     val1 = _mm_extract_epi32(sad_r0, 0);
1656     val2 = _mm_extract_epi32(sad_r0, 2);
1657     pi4_sad[0] = (val1 + val2);
1658 
1659     val1 = _mm_extract_epi32(sad_r1, 0);
1660     val2 = _mm_extract_epi32(sad_r1, 2);
1661     pi4_sad[1] = (val1 + val2);
1662 
1663     val1 = _mm_extract_epi32(sad_r2, 0);
1664     val2 = _mm_extract_epi32(sad_r2, 2);
1665     pi4_sad[2] = (val1 + val2);
1666 
1667     val1 = _mm_extract_epi32(sad_r3, 0);
1668     val2 = _mm_extract_epi32(sad_r3, 2);
1669     pi4_sad[3] = (val1 + val2);
1670 
1671     val1 = _mm_extract_epi32(sad_r4, 0);
1672     val2 = _mm_extract_epi32(sad_r4, 2);
1673     pi4_sad[4] = (val1 + val2);
1674 
1675     val1 = _mm_extract_epi32(sad_r5, 0);
1676     val2 = _mm_extract_epi32(sad_r5, 2);
1677     pi4_sad[5] = (val1 + val2);
1678 
1679     val1 = _mm_extract_epi32(sad_r6, 0);
1680     val2 = _mm_extract_epi32(sad_r6, 2);
1681     pi4_sad[6] = (val1 + val2);
1682 
1683     val1 = _mm_extract_epi32(sad_r7, 0);
1684     val2 = _mm_extract_epi32(sad_r7, 2);
1685     pi4_sad[7] = (val1 + val2);
1686 
1687     return;
1688 }
/*
*
* @brief This function computes SAD between two 16x16 blocks.
*        It also computes if the block will be zero after H264 transform and quant for
*        Intra 16x16 blocks
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_est
*  UWORD8 pointer to the estimated (predicted) block
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] est_strd
*  integer stride of the estimated block
*
* @param[in] pu2_thrsh
*  Threshold for each element of the transformed quantized block
*
* @param[out] pi4_mb_distortion
*  integer evaluated sad
*
* @param[out] pu4_is_zero
*  Pointer to store if the block is zero after transform and quantization
*
* @remarks
*
******************************************************************************
*/
ime_compute_satqd_16x16_lumainter_sse42(UWORD8 * pu1_src,UWORD8 * pu1_est,WORD32 src_strd,WORD32 est_strd,UWORD16 * pu2_thrsh,WORD32 * pi4_mb_distortion,UWORD32 * pu4_is_zero)1720 void ime_compute_satqd_16x16_lumainter_sse42(UWORD8 *pu1_src,
1721                                          UWORD8 *pu1_est,
1722                                          WORD32 src_strd,
1723                                          WORD32 est_strd,
1724                                          UWORD16 *pu2_thrsh,
1725                                          WORD32 *pi4_mb_distortion,
1726                                          UWORD32 *pu4_is_zero)
1727 {
1728     __m128i src_r0, src_r1, src_r2, src_r3;
1729     __m128i est_r0, est_r1, est_r2, est_r3;
1730     __m128i temp0, temp1, temp2, temp3, temp4;
1731     __m128i zero = _mm_setzero_si128();          // all bits reset to zero
1732     __m128i all_one = _mm_set1_epi8(0xFF);
1733     __m128i sad_b1, sad_b2, threshold;
1734     WORD16 sad_1, sad_2;
1735     WORD32 i;
1736     UWORD32 flag = 0;
1737     WORD32 test1, test2;
1738     threshold = _mm_loadu_si128((__m128i *) pu2_thrsh);
1739     (*pi4_mb_distortion) = 0;
1740 
1741     for (i=0; i<4; i++)
1742     {
1743         src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
1744         src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
1745         src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
1746         src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2
1747 
1748         src_r0 = _mm_cvtepu8_epi16(src_r0);
1749         src_r1 = _mm_cvtepu8_epi16(src_r1);
1750         src_r2 = _mm_cvtepu8_epi16(src_r2);
1751         src_r3 = _mm_cvtepu8_epi16(src_r3);
1752 
1753         est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
1754         est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
1755         est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
1756         est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));
1757 
1758         est_r0 = _mm_cvtepu8_epi16(est_r0);
1759         est_r1 = _mm_cvtepu8_epi16(est_r1);
1760         est_r2 = _mm_cvtepu8_epi16(est_r2);
1761         est_r3 = _mm_cvtepu8_epi16(est_r3);
1762 
1763         src_r0 = _mm_sub_epi16(src_r0, est_r0);
1764         src_r1 = _mm_sub_epi16(src_r1, est_r1);
1765         src_r2 = _mm_sub_epi16(src_r2, est_r2);
1766         src_r3 = _mm_sub_epi16(src_r3, est_r3);
1767 
1768         src_r0 = _mm_abs_epi16(src_r0);
1769         src_r1 = _mm_abs_epi16(src_r1);
1770         src_r2 = _mm_abs_epi16(src_r2);
1771         src_r3 = _mm_abs_epi16(src_r3);
1772 
1773         src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
1774         src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2
1775 
1776         //SAD calculation
1777         temp0 = _mm_add_epi16(src_r0, src_r1);      //s1+s2 s4+s3 s4+s3 s1+s2 a1+a2 a4+a3 a4+a3 a1+a2
1778         temp0 = _mm_hadd_epi16(temp0, zero);
1779         temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values
1780 
1781         sad_1 = _mm_extract_epi16(temp0, 0);
1782         sad_2 = _mm_extract_epi16(temp0, 1);
1783 
1784         (*pi4_mb_distortion) += sad_1 + sad_2;
1785 
1786         if (flag == 0) {
1787             sad_b1 = _mm_set1_epi16((sad_1 << 1));
1788             sad_b2 = _mm_set1_epi16((sad_2 << 1));
1789 
1790             src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
1791             src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4
1792 
1793             src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
1794             src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3
1795 
1796             src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
1797             src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0
1798 
1799             temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
1800             temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0
1801 
1802             temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
1803             temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0
1804 
1805             temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
1806             temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0
1807 
1808             temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0
1809 
1810             temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
1811             temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0
1812 
1813             temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0
1814 
1815             temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
1816             temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)
1817 
1818             temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
1819             temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)
1820 
1821             sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
1822             sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1
1823 
1824             temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff
1825 
1826             temp1 = _mm_cmpgt_epi16(threshold, sad_b2);
1827 
1828             temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
1829             temp1 = _mm_xor_si128(temp1, all_one);
1830 
1831             test1 = _mm_test_all_zeros(temp0, all_one);
1832             test2 = _mm_test_all_zeros(temp1, all_one);
1833 
1834             if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
1835                     || pu2_thrsh[8] <= sad_2)
1836                 flag = 1;
1837         }
1838 
1839         pu1_src += 8;
1840         pu1_est += 8;
1841 
1842         src_r0 = _mm_loadl_epi64((__m128i *) pu1_src);  //Row 0 - Block1 and 2
1843         src_r1 = _mm_loadl_epi64((__m128i *) (pu1_src + src_strd)); //Row 1 - Block1 and 2
1844         src_r2 = _mm_loadl_epi64((__m128i *) (pu1_src + 2 * src_strd)); //Row 2 - Block1 and 2
1845         src_r3 = _mm_loadl_epi64((__m128i *) (pu1_src + 3 * src_strd)); //Row 3 - Block1 and 2
1846 
1847         src_r0 = _mm_cvtepu8_epi16(src_r0);
1848         src_r1 = _mm_cvtepu8_epi16(src_r1);
1849         src_r2 = _mm_cvtepu8_epi16(src_r2);
1850         src_r3 = _mm_cvtepu8_epi16(src_r3);
1851 
1852         est_r0 = _mm_loadl_epi64((__m128i *) pu1_est);
1853         est_r1 = _mm_loadl_epi64((__m128i *) (pu1_est + est_strd));
1854         est_r2 = _mm_loadl_epi64((__m128i *) (pu1_est + 2 * est_strd));
1855         est_r3 = _mm_loadl_epi64((__m128i *) (pu1_est + 3 * est_strd));
1856 
1857         est_r0 = _mm_cvtepu8_epi16(est_r0);
1858         est_r1 = _mm_cvtepu8_epi16(est_r1);
1859         est_r2 = _mm_cvtepu8_epi16(est_r2);
1860         est_r3 = _mm_cvtepu8_epi16(est_r3);
1861 
1862         src_r0 = _mm_sub_epi16(src_r0, est_r0);
1863         src_r1 = _mm_sub_epi16(src_r1, est_r1);
1864         src_r2 = _mm_sub_epi16(src_r2, est_r2);
1865         src_r3 = _mm_sub_epi16(src_r3, est_r3);
1866 
1867         src_r0 = _mm_abs_epi16(src_r0);
1868         src_r1 = _mm_abs_epi16(src_r1);
1869         src_r2 = _mm_abs_epi16(src_r2);
1870         src_r3 = _mm_abs_epi16(src_r3);
1871 
1872         src_r0 = _mm_add_epi16(src_r0, src_r3);     //s1 s4 s4 s1 a1 a4 a4 a1
1873         src_r1 = _mm_add_epi16(src_r1, src_r2);     //s2 s3 s3 s2 a2 a3 a3 a2
1874 
1875         //SAD calculation
1876         temp0 = _mm_add_epi16(src_r0, src_r1);
1877         temp0 = _mm_hadd_epi16(temp0, zero);
1878         temp0 = _mm_hadd_epi16(temp0, zero);        //sad1, sad2 - 16bit values
1879 
1880         sad_1 = _mm_extract_epi16(temp0, 0);
1881         sad_2 = _mm_extract_epi16(temp0, 1);
1882 
1883         (*pi4_mb_distortion) += sad_1 + sad_2;
1884 
1885         if (flag == 0) {
1886             sad_b1 = _mm_set1_epi16((sad_1 << 1));
1887             sad_b2 = _mm_set1_epi16((sad_2 << 1));
1888 
1889             src_r0 = _mm_shufflelo_epi16(src_r0, 0x9c); //Block 0 s1 s1 s4 s4 a1 a4 a4 a1
1890             src_r0 = _mm_shufflehi_epi16(src_r0, 0x9c); //Block 1 s1 s1 s4 s4 a1 a1 a4 a4
1891 
1892             src_r1 = _mm_shufflelo_epi16(src_r1, 0x9c); //Block 0 s2 s2 s3 s3 a2 a3 a3 a2
1893             src_r1 = _mm_shufflehi_epi16(src_r1, 0x9c); //Block 1 s2 s2 s3 s3 a2 a2 a3 a3
1894 
1895             src_r0 = _mm_hadd_epi16(src_r0, zero);      //s1 s4 a1 a4 0 0 0 0
1896             src_r1 = _mm_hadd_epi16(src_r1, zero);      //s2 s3 a2 a3 0 0 0 0
1897 
1898             temp0 = _mm_slli_epi16(src_r0, 1);//s1<<1 s4<<1 a1<<1 a4<<1 0 0 0 0
1899             temp1 = _mm_slli_epi16(src_r1, 1);//s2<<1 s3<<1 a2<<1 a3<<1 0 0 0 0
1900 
1901             temp0 = _mm_shufflelo_epi16(temp0, 0xb1);//s4<<1 s1<<1 a4<<1 a1<<1 0 0 0 0
1902             temp1 = _mm_shufflelo_epi16(temp1, 0xb1);//s3<<1 s2<<1 a3<<1 a2<<1 0 0 0 0
1903 
1904             temp2 = _mm_sub_epi16(src_r0, temp1);//(s1-s3<<1) (s4-s2<<1) (a1-a3<<1) (a4-a2<<1) 0 0 0 0
1905             temp3 = _mm_sub_epi16(src_r1, temp0);//(s2-s4<<1) (s3-s1<<1) (a2-a4<<1) (a3-a1<<1) 0 0 0 0
1906 
1907             temp4 = _mm_add_epi16(src_r0, src_r1);//s1+s2 s4+s3 a1+a2 a4+a3 0 0 0 0
1908 
1909             temp0 = _mm_hadd_epi16(src_r0, zero);   //s1+s4 a1+a4 0 0 0 0 0 0
1910             temp1 = _mm_hadd_epi16(src_r1, zero);   //s2+s3 a2+a3 0 0 0 0 0 0
1911 
1912             temp0 = _mm_unpacklo_epi16(temp0, temp1);//s1+s4 s2+s3 a1+a4 a2+a3 0 0 0 0
1913 
1914             temp0 = _mm_unpacklo_epi32(temp0, temp2);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1)
1915             temp1 = _mm_unpacklo_epi32(temp4, temp3);//s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1) a1+a2 a4+a3 (a2-a4<<1) (a3-a1<<1)
1916 
1917             temp2 = _mm_unpacklo_epi64(temp0, temp1);//s1+s4 s2+s3 (s1-s3<<1) (s4-s2<<1) s1+s2 s4+s3 (s2-s4<<1) (s3-s1<<1)
1918             temp3 = _mm_unpackhi_epi64(temp0, temp1); //a1+a4 a2+a3 (a1-a3<<1) (a4-a2<<1) a1+a2 a4+a3 (s2-s4<<1) (s3-s1<<1)
1919 
1920             sad_b1 = _mm_sub_epi16(sad_b1, temp2);      //lsi values Block0
1921             sad_b2 = _mm_sub_epi16(sad_b2, temp3);      //lsi values Block1
1922 
1923             temp0 = _mm_cmpgt_epi16(threshold, sad_b1); //if any threshold[i]>ls[i], corresponding 16-bit value in temp becomes 0xffff
1924 
1925             temp1 = _mm_cmpgt_epi16(threshold, sad_b2);
1926 
1927             temp0 = _mm_xor_si128(temp0, all_one);      //Xor with 1 => NOT operation
1928             temp1 = _mm_xor_si128(temp1, all_one);
1929 
1930             test1 = _mm_test_all_zeros(temp0, all_one);
1931             test2 = _mm_test_all_zeros(temp1, all_one);
1932 
1933             if (test1 == 0 || test2 == 0 || pu2_thrsh[8] <= sad_1
1934                     || pu2_thrsh[8] <= sad_2)
1935                 flag = 1;
1936         }
1937 
1938         pu1_src += 4*src_strd - 8;
1939         pu1_est += 4*est_strd - 8;
1940     }
1941 
1942         *pu4_is_zero = flag;
1943 }
1944