/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/mem_sse2.h"

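// Pseudo-random dither values, defined elsewhere in vpx_dsp, used to vary
// the rounding of the filtered output.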
extern const int16_t vpx_rv[];

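// Vertical (down) variant of the macroblock post-processing filter. Each
// strip of 8 columns is filtered top to bottom while maintaining a 15-row
// sliding window of per-column sums and sums of squares. Wherever the
// window's variance measure, sumsq * 15 - sum * sum, falls below flimit,
// the pixel is replaced by a dithered average of the window and itself;
// otherwise it is left untouched. The top and bottom borders are emulated
// by replicating the first and last rows of the image.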
void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
                               int cols, int flimit) {
  int col;
  const __m128i zero = _mm_setzero_si128();
  const __m128i f = _mm_set1_epi32(flimit);
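  // Ring buffer holding the original values of the previous 8 rows. Filtered
  // results are written straight back to dst, so the original pixels must be
  // kept until they leave the sliding window.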
  DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);

  // 8 columns are processed at a time.
  // If rows is less than 8 the bottom border extension fails.
  assert(cols % 8 == 0);
  assert(rows >= 8);

  for (col = 0; col < cols; col += 8) {
    int row, i;
    __m128i s = _mm_loadl_epi64((__m128i *)dst);
    __m128i sum, sumsq_0, sumsq_1;
    __m128i tmp_0, tmp_1;
    __m128i below_context;

    s = _mm_unpacklo_epi8(s, zero);

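    // Replicate the first row into every ring-buffer slot: this stands in for
    // the top border extension (rows -8..-1 behave as copies of row 0).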
    for (i = 0; i < 8; ++i) {
      _mm_store_si128((__m128i *)above_context + i, s);
    }

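    // Prime the 15-tap window covering row offsets -8..+6. With the top
    // border replicated, that is 9 copies of row 0 plus rows 1..6, so the
    // sums start at 9 times row 0.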
    // sum *= 9
    sum = _mm_slli_epi16(s, 3);
    sum = _mm_add_epi16(s, sum);

    // sum^2 * 9 == (sum * 9) * sum
    tmp_0 = _mm_mullo_epi16(sum, s);
    tmp_1 = _mm_mulhi_epi16(sum, s);

    sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
    sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);

    // Prime sum/sumsq
    for (i = 1; i <= 6; ++i) {
      __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
      a = _mm_unpacklo_epi8(a, zero);
      sum = _mm_add_epi16(sum, a);
      a = _mm_mullo_epi16(a, a);
      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
    }

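    // One iteration per output row. The loop runs 8 rows past the bottom to
    // keep the same trip count as the C reference; those trailing iterations
    // touch rows below the image, which is safe because the post-processing
    // buffers carry a bottom border (the C version extends it explicitly).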
    for (row = 0; row < rows + 8; row++) {
      const __m128i above =
          _mm_load_si128((__m128i *)above_context + (row & 7));
      __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
      __m128i above_sq, below_sq;
      __m128i mask_0, mask_1;
      __m128i multmp_0, multmp_1;
      __m128i rv;
      __m128i out;

      this_row = _mm_unpacklo_epi8(this_row, zero);

      if (row + 7 < rows) {
        // Instead of copying the bottom border, simply stop loading once the
        // last row has been reached; below_context then keeps replicating it.
        below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
        below_context = _mm_unpacklo_epi8(below_context, zero);
      }

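      // Slide the window down one row: drop the row leaving at offset -8 and
      // add the row entering at offset +7.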
      sum = _mm_sub_epi16(sum, above);
      sum = _mm_add_epi16(sum, below_context);

      // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
      // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
      // because x86 does not have unpack with sign extension.
      above_sq = _mm_mullo_epi16(above, above);
      sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
      sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));

      below_sq = _mm_mullo_epi16(below_context, below_context);
      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));

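      // Flatness test: sumsq * 15 - sum * sum < flimit. This is
      // n * sum(x^2) - (sum(x))^2 with n == 15, proportional to the variance
      // of the 15-row window.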
      // sumsq * 16 - sumsq == sumsq * 15
      mask_0 = _mm_slli_epi32(sumsq_0, 4);
      mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
      mask_1 = _mm_slli_epi32(sumsq_1, 4);
      mask_1 = _mm_sub_epi32(mask_1, sumsq_1);

      multmp_0 = _mm_mullo_epi16(sum, sum);
      multmp_1 = _mm_mulhi_epi16(sum, sum);

      mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
      mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));

      // mask - f gives a negative value when mask < f
      mask_0 = _mm_sub_epi32(mask_0, f);
      mask_1 = _mm_sub_epi32(mask_1, f);

      // Shift the sign bit down to create a mask
      mask_0 = _mm_srai_epi32(mask_0, 31);
      mask_1 = _mm_srai_epi32(mask_1, 31);

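      // Narrow the two 32-bit masks into one 16-bit mask; every lane is 0 or
      // -1, both of which survive the signed saturation unchanged.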
      mask_0 = _mm_packs_epi32(mask_0, mask_1);

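      // Load 8 consecutive dither values. This matches the C code's
      // vpx_rv[(r & 127) + (c & 7)] indexing because each strip starts at a
      // column that is a multiple of 8.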
      rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));

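      // Filtered value: (dither + window sum + pixel) >> 4, the average of
      // the 15 window rows plus the centre pixel counted again, 16 samples
      // in total.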
      mask_1 = _mm_add_epi16(rv, sum);
      mask_1 = _mm_add_epi16(mask_1, this_row);
      mask_1 = _mm_srai_epi16(mask_1, 4);

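      // Per-lane select: keep the filtered value where the flatness test
      // passed (mask lanes are all ones) and the original pixel elsewhere.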
      mask_1 = _mm_and_si128(mask_0, mask_1);
      mask_0 = _mm_andnot_si128(mask_0, this_row);
      out = _mm_or_si128(mask_1, mask_0);

      _mm_storel_epi64((__m128i *)(dst + row * pitch),
                       _mm_packus_epi16(out, zero));

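      // Store the original (unfiltered) row into the ring buffer: the running
      // sums must keep tracking source pixels even though dst now holds the
      // filtered result for this row.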
      _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
    }

    dst += 8;
  }
}