1/*
2 *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "webrtc/modules/desktop_capture/differ_block_sse2.h"
12
13#if defined(_MSC_VER)
14#include <intrin.h>
15#else
16#include <mmintrin.h>
17#include <emmintrin.h>
18#endif
19
20#include "webrtc/modules/desktop_capture/differ_block.h"
21
22namespace webrtc {
23
24extern int BlockDifference_SSE2_W16(const uint8_t* image1,
25                                    const uint8_t* image2,
26                                    int stride) {
27  __m128i acc = _mm_setzero_si128();
28  __m128i v0;
29  __m128i v1;
30  __m128i sad;
31  for (int y = 0; y < kBlockSize; ++y) {
32    const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
33    const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
34    v0 = _mm_loadu_si128(i1);
35    v1 = _mm_loadu_si128(i2);
36    sad = _mm_sad_epu8(v0, v1);
37    acc = _mm_adds_epu16(acc, sad);
38    v0 = _mm_loadu_si128(i1 + 1);
39    v1 = _mm_loadu_si128(i2 + 1);
40    sad = _mm_sad_epu8(v0, v1);
41    acc = _mm_adds_epu16(acc, sad);
42    v0 = _mm_loadu_si128(i1 + 2);
43    v1 = _mm_loadu_si128(i2 + 2);
44    sad = _mm_sad_epu8(v0, v1);
45    acc = _mm_adds_epu16(acc, sad);
46    v0 = _mm_loadu_si128(i1 + 3);
47    v1 = _mm_loadu_si128(i2 + 3);
48    sad = _mm_sad_epu8(v0, v1);
49    acc = _mm_adds_epu16(acc, sad);
50
51    // This essential means sad = acc >> 64. We only care about the lower 16
52    // bits.
53    sad = _mm_shuffle_epi32(acc, 0xEE);
54    sad = _mm_adds_epu16(sad, acc);
55    int diff = _mm_cvtsi128_si32(sad);
56    if (diff)
57      return 1;
58    image1 += stride;
59    image2 += stride;
60  }
61  return 0;
62}
63
64extern int BlockDifference_SSE2_W32(const uint8_t* image1,
65                                    const uint8_t* image2,
66                                    int stride) {
67  __m128i acc = _mm_setzero_si128();
68  __m128i v0;
69  __m128i v1;
70  __m128i sad;
71  for (int y = 0; y < kBlockSize; ++y) {
72    const __m128i* i1 = reinterpret_cast<const __m128i*>(image1);
73    const __m128i* i2 = reinterpret_cast<const __m128i*>(image2);
74    v0 = _mm_loadu_si128(i1);
75    v1 = _mm_loadu_si128(i2);
76    sad = _mm_sad_epu8(v0, v1);
77    acc = _mm_adds_epu16(acc, sad);
78    v0 = _mm_loadu_si128(i1 + 1);
79    v1 = _mm_loadu_si128(i2 + 1);
80    sad = _mm_sad_epu8(v0, v1);
81    acc = _mm_adds_epu16(acc, sad);
82    v0 = _mm_loadu_si128(i1 + 2);
83    v1 = _mm_loadu_si128(i2 + 2);
84    sad = _mm_sad_epu8(v0, v1);
85    acc = _mm_adds_epu16(acc, sad);
86    v0 = _mm_loadu_si128(i1 + 3);
87    v1 = _mm_loadu_si128(i2 + 3);
88    sad = _mm_sad_epu8(v0, v1);
89    acc = _mm_adds_epu16(acc, sad);
90    v0 = _mm_loadu_si128(i1 + 4);
91    v1 = _mm_loadu_si128(i2 + 4);
92    sad = _mm_sad_epu8(v0, v1);
93    acc = _mm_adds_epu16(acc, sad);
94    v0 = _mm_loadu_si128(i1 + 5);
95    v1 = _mm_loadu_si128(i2 + 5);
96    sad = _mm_sad_epu8(v0, v1);
97    acc = _mm_adds_epu16(acc, sad);
98    v0 = _mm_loadu_si128(i1 + 6);
99    v1 = _mm_loadu_si128(i2 + 6);
100    sad = _mm_sad_epu8(v0, v1);
101    acc = _mm_adds_epu16(acc, sad);
102    v0 = _mm_loadu_si128(i1 + 7);
103    v1 = _mm_loadu_si128(i2 + 7);
104    sad = _mm_sad_epu8(v0, v1);
105    acc = _mm_adds_epu16(acc, sad);
106
107    // This essential means sad = acc >> 64. We only care about the lower 16
108    // bits.
109    sad = _mm_shuffle_epi32(acc, 0xEE);
110    sad = _mm_adds_epu16(sad, acc);
111    int diff = _mm_cvtsi128_si32(sad);
112    if (diff)
113      return 1;
114    image1 += stride;
115    image2 += stride;
116  }
117  return 0;
118}
119
120}  // namespace webrtc
121