1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  icv_sad.c
24 *
25 * @brief
26 *  This file contains the functions to compute SAD
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *  icv_sad_8x4_ssse3()
33 *
34 * @remarks
35 *  None
36 *
37 *******************************************************************************
38 */
39 /*****************************************************************************/
40 /* File Includes                                                             */
41 /*****************************************************************************/
42 /* System include files */
43 #include <stdio.h>
44 #include <stdint.h>
45 #include <string.h>
46 #include <stdlib.h>
47 #include <assert.h>
48 #include <immintrin.h>
49 
50 /* User include files */
51 #include "icv_datatypes.h"
52 #include "icv_macros.h"
53 #include "icv_platform_macros.h"
54 #include "icv.h"
55 
56 /**
57 *******************************************************************************
58 *
59 * @brief
60 *  Compute 8x4 SAD
61 *
62 * @par   Description
63 *  Compute 8x4 sum of absolute differences between source and reference block
64 *
65 * @param[in] pu1_src
66 *  Source buffer
67 *
68 * @param[in] pu1_ref
69 *  Reference buffer
70 *
71 * @param[in] src_strd
72 *  Source stride
73 *
74 * @param[in] ref_strd
75 *  Reference stride
76 *
77 * @param[in] wd
78 *  Assumed to be 8
79 *
80 * @param[in] ht
81 *  Assumed to be 4
82 
83 * @returns
84 *  SAD
85 *
86 * @remarks
87 *
88 *******************************************************************************
89 */
90 WORD32 icv_sad_8x4_ssse3(UWORD8 *pu1_src,
91                          UWORD8 *pu1_ref,
92                          WORD32 src_strd,
93                          WORD32 ref_strd,
94                          WORD32 wd,
95                          WORD32 ht)
96 {
97     WORD32 sad;
98     __m128 src_r0, src_r1;
99     __m128 ref_r0, ref_r1;
100     __m128i res_r0, res_r1;
101 
102     UNUSED(wd);
103     UNUSED(ht);
104     ASSERT(wd == 8);
105     ASSERT(ht == 4);
106 
107     /* Load source */
108     src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
109     pu1_src += src_strd;
110 
111     src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
112     pu1_src += src_strd;
113 
114     src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
115     pu1_src += src_strd;
116 
117     src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
118     pu1_src += src_strd;
119 
120 
121     /* Load reference */
122     ref_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
123     pu1_ref += ref_strd;
124 
125     ref_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
126     pu1_ref += ref_strd;
127 
128     ref_r0 = _mm_loadh_pi (ref_r0, (__m64 *) (pu1_ref));
129     pu1_ref += ref_strd;
130 
131     ref_r1 = _mm_loadh_pi (ref_r1, (__m64 *) (pu1_ref));
132     pu1_ref += ref_strd;
133 
134     /* Compute SAD for each row */
135     res_r0 = _mm_sad_epu8((__m128i)src_r0, (__m128i)ref_r0);
136     res_r1 = _mm_sad_epu8((__m128i)src_r1, (__m128i)ref_r1);
137 
138     /* Accumulate SAD */
139     res_r0 = _mm_add_epi64(res_r0,  res_r1);
140     res_r0 = _mm_add_epi64(res_r0, _mm_srli_si128(res_r0, 8));
141 
142     sad  = _mm_cvtsi128_si32(res_r0);
143 
144     return sad;
145 }
146