/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include <arm_neon.h>
12 #include "./vpx_dsp_rtcd.h"
13 #include "./vpx_config.h"
14
15 #include "vpx_ports/mem.h"
16 #include "vpx/vpx_integer.h"
17
18 #include "vpx_dsp/variance.h"
19
// 2-tap bilinear filter coefficients, indexed by eighth-pel offset (0-7).
// Each tap pair sums to 128 (1 << FILTER_BITS), so filtering preserves the
// input's dynamic range after the rounding shift.
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 },  { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 },  { 48, 80 },  { 32, 96 }, { 16, 112 },
};
30
// Apply a 2-tap bilinear filter to an 8-pixel-wide block.
// pixel_step selects the direction: 1 filters horizontally (second tap
// reads the next column); the source stride filters vertically (second
// tap reads the next row). Writes output_height rows of 8 bytes each,
// output_width bytes apart.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const uint8_t *filter) {
  const uint8x8_t tap0 = vmov_n_u8(filter[0]);
  const uint8x8_t tap1 = vmov_n_u8(filter[1]);
  unsigned int row;
  for (row = 0; row < output_height; ++row) {
    const uint8x8_t pixels0 = vld1_u8(src_ptr);
    const uint8x8_t pixels1 = vld1_u8(src_ptr + pixel_step);
    // Widening multiply-accumulate of the two taps, then round and
    // narrow back to 8 bits.
    uint16x8_t sum = vmull_u8(pixels0, tap0);
    sum = vmlal_u8(sum, pixels1, tap1);
    vst1_u8(output_ptr, vrshrn_n_u16(sum, FILTER_BITS));
    // Advance to the next row.
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
53
// Apply a 2-tap bilinear filter to a block whose width is a multiple of
// 16 pixels. pixel_step selects the direction: 1 filters horizontally;
// the source stride filters vertically. Writes output_height rows of
// output_width bytes each.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t tap0 = vmov_n_u8(filter[0]);
  const uint8x8_t tap1 = vmov_n_u8(filter[1]);
  unsigned int row, col;
  for (row = 0; row < output_height; ++row) {
    // Process the row 16 pixels at a time; each 16-byte vector is
    // filtered as two 8-lane halves because the multiplies widen to
    // 16 bits.
    for (col = 0; col < output_width; col += 16) {
      const uint8x16_t pixels0 = vld1q_u8(src_ptr + col);
      const uint8x16_t pixels1 = vld1q_u8(src_ptr + col + pixel_step);
      uint16x8_t sum_lo = vmull_u8(vget_low_u8(pixels0), tap0);
      uint16x8_t sum_hi = vmull_u8(vget_high_u8(pixels0), tap0);
      sum_lo = vmlal_u8(sum_lo, vget_low_u8(pixels1), tap1);
      sum_hi = vmlal_u8(sum_hi, vget_high_u8(pixels1), tap1);
      // Round, narrow both halves back to 8 bits, and store.
      const uint8x8_t out_lo = vrshrn_n_u16(sum_lo, FILTER_BITS);
      const uint8x8_t out_hi = vrshrn_n_u16(sum_hi, FILTER_BITS);
      vst1q_u8(output_ptr + col, vcombine_u8(out_lo, out_hi));
    }
    // Advance to the next row.
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
81
vpx_sub_pixel_variance8x8_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)82 unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
83 int src_stride,
84 int xoffset,
85 int yoffset,
86 const uint8_t *dst,
87 int dst_stride,
88 unsigned int *sse) {
89 DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
90 DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
91
92 var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
93 9, 8,
94 bilinear_filters[xoffset]);
95 var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
96 8, bilinear_filters[yoffset]);
97 return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
98 }
99
vpx_sub_pixel_variance16x16_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)100 unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
101 int src_stride,
102 int xoffset,
103 int yoffset,
104 const uint8_t *dst,
105 int dst_stride,
106 unsigned int *sse) {
107 DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
108 DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
109
110 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
111 17, 16,
112 bilinear_filters[xoffset]);
113 var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
114 16, bilinear_filters[yoffset]);
115 return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
116 }
117
vpx_sub_pixel_variance32x32_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)118 unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
119 int src_stride,
120 int xoffset,
121 int yoffset,
122 const uint8_t *dst,
123 int dst_stride,
124 unsigned int *sse) {
125 DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
126 DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
127
128 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
129 33, 32,
130 bilinear_filters[xoffset]);
131 var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
132 32, bilinear_filters[yoffset]);
133 return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
134 }
135
vpx_sub_pixel_variance64x64_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)136 unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
137 int src_stride,
138 int xoffset,
139 int yoffset,
140 const uint8_t *dst,
141 int dst_stride,
142 unsigned int *sse) {
143 DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
144 DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
145
146 var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
147 65, 64,
148 bilinear_filters[xoffset]);
149 var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
150 64, bilinear_filters[yoffset]);
151 return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
152 }
153