1 /*
2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "webrtc/modules/desktop_capture/differ_block_sse2.h"
12
13 #if defined(_MSC_VER)
14 #include <intrin.h>
15 #else
16 #include <mmintrin.h>
17 #include <emmintrin.h>
18 #endif
19
20 #include "webrtc/modules/desktop_capture/differ_block.h"
21
22 namespace webrtc {
23
BlockDifference_SSE2_W16(const uint8_t * image1,const uint8_t * image2,int stride)24 extern bool BlockDifference_SSE2_W16(const uint8_t* image1,
25 const uint8_t* image2,
26 int stride) {
27 __m128i acc = _mm_setzero_si128();
28 __m128i v0;
29 __m128i v1;
30 __m128i sad;
31 for (int y = 0; y < kBlockSize; ++y) {
32 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
33 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
34 v0 = _mm_loadu_si128(i1);
35 v1 = _mm_loadu_si128(i2);
36 sad = _mm_sad_epu8(v0, v1);
37 acc = _mm_adds_epu16(acc, sad);
38 v0 = _mm_loadu_si128(i1 + 1);
39 v1 = _mm_loadu_si128(i2 + 1);
40 sad = _mm_sad_epu8(v0, v1);
41 acc = _mm_adds_epu16(acc, sad);
42 v0 = _mm_loadu_si128(i1 + 2);
43 v1 = _mm_loadu_si128(i2 + 2);
44 sad = _mm_sad_epu8(v0, v1);
45 acc = _mm_adds_epu16(acc, sad);
46 v0 = _mm_loadu_si128(i1 + 3);
47 v1 = _mm_loadu_si128(i2 + 3);
48 sad = _mm_sad_epu8(v0, v1);
49 acc = _mm_adds_epu16(acc, sad);
50
51 // This essential means sad = acc >> 64. We only care about the lower 16
52 // bits.
53 sad = _mm_shuffle_epi32(acc, 0xEE);
54 sad = _mm_adds_epu16(sad, acc);
55 int diff = _mm_cvtsi128_si32(sad);
56 if (diff)
57 return true;
58 image1 += stride;
59 image2 += stride;
60 }
61 return false;
62 }
63
BlockDifference_SSE2_W32(const uint8_t * image1,const uint8_t * image2,int stride)64 extern bool BlockDifference_SSE2_W32(const uint8_t* image1,
65 const uint8_t* image2,
66 int stride) {
67 __m128i acc = _mm_setzero_si128();
68 __m128i v0;
69 __m128i v1;
70 __m128i sad;
71 for (int y = 0; y < kBlockSize; ++y) {
72 const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
73 const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
74 v0 = _mm_loadu_si128(i1);
75 v1 = _mm_loadu_si128(i2);
76 sad = _mm_sad_epu8(v0, v1);
77 acc = _mm_adds_epu16(acc, sad);
78 v0 = _mm_loadu_si128(i1 + 1);
79 v1 = _mm_loadu_si128(i2 + 1);
80 sad = _mm_sad_epu8(v0, v1);
81 acc = _mm_adds_epu16(acc, sad);
82 v0 = _mm_loadu_si128(i1 + 2);
83 v1 = _mm_loadu_si128(i2 + 2);
84 sad = _mm_sad_epu8(v0, v1);
85 acc = _mm_adds_epu16(acc, sad);
86 v0 = _mm_loadu_si128(i1 + 3);
87 v1 = _mm_loadu_si128(i2 + 3);
88 sad = _mm_sad_epu8(v0, v1);
89 acc = _mm_adds_epu16(acc, sad);
90 v0 = _mm_loadu_si128(i1 + 4);
91 v1 = _mm_loadu_si128(i2 + 4);
92 sad = _mm_sad_epu8(v0, v1);
93 acc = _mm_adds_epu16(acc, sad);
94 v0 = _mm_loadu_si128(i1 + 5);
95 v1 = _mm_loadu_si128(i2 + 5);
96 sad = _mm_sad_epu8(v0, v1);
97 acc = _mm_adds_epu16(acc, sad);
98 v0 = _mm_loadu_si128(i1 + 6);
99 v1 = _mm_loadu_si128(i2 + 6);
100 sad = _mm_sad_epu8(v0, v1);
101 acc = _mm_adds_epu16(acc, sad);
102 v0 = _mm_loadu_si128(i1 + 7);
103 v1 = _mm_loadu_si128(i2 + 7);
104 sad = _mm_sad_epu8(v0, v1);
105 acc = _mm_adds_epu16(acc, sad);
106
107 // This essential means sad = acc >> 64. We only care about the lower 16
108 // bits.
109 sad = _mm_shuffle_epi32(acc, 0xEE);
110 sad = _mm_adds_epu16(sad, acc);
111 int diff = _mm_cvtsi128_si32(sad);
112 if (diff)
113 return true;
114 image1 += stride;
115 image2 += stride;
116 }
117 return false;
118 }
119
120 } // namespace webrtc
121