1 /*
2  *  Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "modules/desktop_capture/differ_vector_sse2.h"
12 
13 #if defined(_MSC_VER)
14 #include <intrin.h>
15 #else
16 #include <emmintrin.h>
17 #include <mmintrin.h>
18 #endif
19 
20 namespace webrtc {
21 
// Reports whether two 64-byte buffers differ in any byte.
//
// Reads exactly 64 bytes (four 16-byte vectors) from each of `image1` and
// `image2`. Unaligned loads are used, so neither pointer has to be 16-byte
// aligned. Presumably the 64 bytes are one row of a 16-pixel-wide block at
// 4 bytes per pixel -- confirm against the caller.
//
// Returns true if at least one byte differs, false if all 64 bytes match.
extern bool VectorDifference_SSE2_W16(const uint8_t* image1,
                                      const uint8_t* image2) {
  // Number of 16-byte vectors compared per call (4 * 16 = 64 bytes).
  constexpr int kVectorCount = 4;

  const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
  const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);

  __m128i acc = _mm_setzero_si128();
  for (int i = 0; i < kVectorCount; ++i) {
    const __m128i v0 = _mm_loadu_si128(i1 + i);
    const __m128i v1 = _mm_loadu_si128(i2 + i);
    // _mm_sad_epu8 produces one sum of absolute differences per 64-bit half,
    // stored in the low 16 bits of that half (the remaining bits are zero).
    // Each sum is at most 8 * 255, so the saturating 16-bit accumulation
    // cannot saturate within kVectorCount iterations.
    acc = _mm_adds_epu16(acc, _mm_sad_epu8(v0, v1));
  }

  // Copy the upper 64 bits of acc into the lower 64 bits (essentially
  // acc >> 64) and add them to the lower half. Only the low 16 bits of the
  // result carry the combined sum; bits 16..31 are zero, so testing the low
  // 32 bits is equivalent.
  const __m128i upper = _mm_shuffle_epi32(acc, 0xEE);
  const __m128i total = _mm_adds_epu16(upper, acc);
  return _mm_cvtsi128_si32(total) != 0;
}
53 
// Reports whether two 128-byte buffers differ in any byte.
//
// Reads exactly 128 bytes (eight 16-byte vectors) from each of `image1` and
// `image2`. Unaligned loads are used, so neither pointer has to be 16-byte
// aligned. Presumably the 128 bytes are one row of a 32-pixel-wide block at
// 4 bytes per pixel -- confirm against the caller.
//
// Returns true if at least one byte differs, false if all 128 bytes match.
extern bool VectorDifference_SSE2_W32(const uint8_t* image1,
                                      const uint8_t* image2) {
  // Number of 16-byte vectors compared per call (8 * 16 = 128 bytes).
  constexpr int kVectorCount = 8;

  const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
  const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);

  __m128i acc = _mm_setzero_si128();
  for (int i = 0; i < kVectorCount; ++i) {
    const __m128i v0 = _mm_loadu_si128(i1 + i);
    const __m128i v1 = _mm_loadu_si128(i2 + i);
    // _mm_sad_epu8 produces one sum of absolute differences per 64-bit half,
    // stored in the low 16 bits of that half (the remaining bits are zero).
    // Each sum is at most 8 * 255, so the saturating 16-bit accumulation
    // cannot saturate within kVectorCount iterations.
    acc = _mm_adds_epu16(acc, _mm_sad_epu8(v0, v1));
  }

  // Copy the upper 64 bits of acc into the lower 64 bits (essentially
  // acc >> 64) and add them to the lower half. Only the low 16 bits of the
  // result carry the combined sum; bits 16..31 are zero, so testing the low
  // 32 bits is equivalent.
  const __m128i upper = _mm_shuffle_epi32(acc, 0xEE);
  const __m128i total = _mm_adds_epu16(upper, acc);
  return _mm_cvtsi128_si32(total) != 0;
}
101 
102 }  // namespace webrtc
103