1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  icv_variance_sse42.c
24 *
25 * @brief
26 *  This file contains the functions to compute variance
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *  icv_variance_8x4_ssse3()
33 *
34 * @remarks
35 *  None
36 *
37 *******************************************************************************
38 */
39 /*****************************************************************************/
40 /* File Includes                                                             */
41 /*****************************************************************************/
42 /* System include files */
43 #include <stdio.h>
44 #include <stdint.h>
45 #include <string.h>
46 #include <stdlib.h>
47 #include <assert.h>
48 #include <immintrin.h>
49 
50 /* User include files */
51 #include "icv_datatypes.h"
52 #include "icv_macros.h"
53 #include "icv_platform_macros.h"
54 #include "icv.h"
55 
56 /**
57 *******************************************************************************
58 *
59 * @brief
60 *  Computes variance of a given 8x4 block
61 *
62 * @par   Description
63 *  Compute variance of a given 8x4 block
64 *
65 * @param[in] pu1_src
66 *  Source
67 *
68 * @param[in] src_strd
69 *  Source stride
70 *
71 * @param[in] wd
72 *  Assumed to be 8
73 *
74 * @param[in] ht
75 *  Assumed to be 4
76 *
77 * @returns
78 *  Variance
79 *
80 * @remarks
81 *
82 *******************************************************************************
83 */
84 WORD32 icv_variance_8x4_ssse3(UWORD8 *pu1_src, WORD32 src_strd, WORD32 wd, WORD32 ht)
85 {
86     WORD32 sum;
87     WORD32 sum_sqr;
88     WORD32 blk_sz;
89     WORD32 vrnc;
90     __m128  src_r0, src_r1;
91     __m128i ssrc_r0, ssrc_r1, ssrc_r2, ssrc_r3;
92     __m128i sum_r0, sum_r1;
93     __m128i sqr_r0, sqr_r1, sqr_r2, sqr_r3;
94     __m128i vsum, vsum_sqr;
95     __m128i zero;
96     UNUSED(wd);
97     UNUSED(ht);
98 
99     ASSERT(wd == 8);
100     ASSERT(ht == 4);
101 
102     sum     = 0;
103     sum_sqr = 0;
104 
105     blk_sz = 8 * 4;
106 
107     zero = _mm_setzero_si128();
108 
109     /* Load source */
110     src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
111     pu1_src += src_strd;
112 
113     src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
114     pu1_src += src_strd;
115 
116     src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
117     pu1_src += src_strd;
118 
119     src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
120     pu1_src += src_strd;
121 
122     /* Compute sum of all elements */
123     /* Use SAD with 0, since there is no pairwise addition */
124     sum_r0  = _mm_sad_epu8((__m128i)src_r0, zero);
125     sum_r1  = _mm_sad_epu8((__m128i)src_r1, zero);
126 
127     /* Accumulate SAD */
128     vsum    = _mm_add_epi64(sum_r0, sum_r1);
129     vsum    = _mm_add_epi64(vsum, _mm_srli_si128(vsum, 8));
130 
131     sum = _mm_cvtsi128_si32(vsum);
132 
133     /* Unpack to 16 bits */
134     ssrc_r0 = _mm_unpacklo_epi8((__m128i)src_r0, zero);
135     ssrc_r1 = _mm_unpacklo_epi8((__m128i)src_r1, zero);
136     ssrc_r2 = _mm_unpackhi_epi8((__m128i)src_r0, zero);
137     ssrc_r3 = _mm_unpackhi_epi8((__m128i)src_r1, zero);
138 
139     /* Compute sum of squares */
140     sqr_r0 = _mm_madd_epi16(ssrc_r0,  ssrc_r0);
141     sqr_r1 = _mm_madd_epi16(ssrc_r1,  ssrc_r1);
142     sqr_r2 = _mm_madd_epi16(ssrc_r2,  ssrc_r2);
143     sqr_r3 = _mm_madd_epi16(ssrc_r3,  ssrc_r3);
144 
145     vsum_sqr = _mm_add_epi32(sqr_r0,   sqr_r1);
146     vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r2);
147     vsum_sqr = _mm_add_epi32(vsum_sqr, sqr_r3);
148 
149     vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 8));
150     vsum_sqr = _mm_add_epi32(vsum_sqr, _mm_srli_si128(vsum_sqr, 4));
151     sum_sqr  = _mm_cvtsi128_si32(vsum_sqr);
152 
153     /* Compute variance */
154     vrnc = ((sum_sqr * blk_sz) - (sum * sum)) / (blk_sz * blk_sz);
155 
156     return vrnc;
157 }
158 
159