1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include "./vpx_dsp_rtcd.h"
13 #include "./vpx_config.h"
14 
15 #include "vpx_ports/mem.h"
16 #include "vpx/vpx_integer.h"
17 
18 #include "vpx_dsp/variance.h"
19 
// 2-tap bilinear filter coefficients, one row per sub-pixel offset (0..7).
// Column 0 weights the current pixel and column 1 weights its neighbour.
// The two taps of every row add up to 128.
static const uint8_t bilinear_filters[8][2] = {
  { 128, 0 },   // offset 0
  { 112, 16 },  // offset 1
  { 96, 32 },   // offset 2
  { 80, 48 },   // offset 3
  { 64, 64 },   // offset 4
  { 48, 80 },   // offset 5
  { 32, 96 },   // offset 6
  { 16, 112 },  // offset 7
};
30 
var_filter_block2d_bil_w8(const uint8_t * src_ptr,uint8_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)31 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
32                                       uint8_t *output_ptr,
33                                       unsigned int src_pixels_per_line,
34                                       int pixel_step,
35                                       unsigned int output_height,
36                                       unsigned int output_width,
37                                       const uint8_t *filter) {
38   const uint8x8_t f0 = vmov_n_u8(filter[0]);
39   const uint8x8_t f1 = vmov_n_u8(filter[1]);
40   unsigned int i;
41   for (i = 0; i < output_height; ++i) {
42     const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
43     const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
44     const uint16x8_t a = vmull_u8(src_0, f0);
45     const uint16x8_t b = vmlal_u8(a, src_1, f1);
46     const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
47     vst1_u8(&output_ptr[0], out);
48     // Next row...
49     src_ptr += src_pixels_per_line;
50     output_ptr += output_width;
51   }
52 }
53 
var_filter_block2d_bil_w16(const uint8_t * src_ptr,uint8_t * output_ptr,unsigned int src_pixels_per_line,int pixel_step,unsigned int output_height,unsigned int output_width,const uint8_t * filter)54 static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
55                                        uint8_t *output_ptr,
56                                        unsigned int src_pixels_per_line,
57                                        int pixel_step,
58                                        unsigned int output_height,
59                                        unsigned int output_width,
60                                        const uint8_t *filter) {
61   const uint8x8_t f0 = vmov_n_u8(filter[0]);
62   const uint8x8_t f1 = vmov_n_u8(filter[1]);
63   unsigned int i, j;
64   for (i = 0; i < output_height; ++i) {
65     for (j = 0; j < output_width; j += 16) {
66       const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
67       const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
68       const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
69       const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
70       const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
71       const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
72       const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
73       const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
74       vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
75     }
76     // Next row...
77     src_ptr += src_pixels_per_line;
78     output_ptr += output_width;
79   }
80 }
81 
vpx_sub_pixel_variance8x8_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)82 unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
83                                             int src_stride,
84                                             int xoffset,
85                                             int yoffset,
86                                             const uint8_t *dst,
87                                             int dst_stride,
88                                             unsigned int *sse) {
89   DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
90   DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
91 
92   var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
93                             9, 8,
94                             bilinear_filters[xoffset]);
95   var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
96                             8, bilinear_filters[yoffset]);
97   return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
98 }
99 
vpx_sub_pixel_variance16x16_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)100 unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
101                                               int src_stride,
102                                               int xoffset,
103                                               int yoffset,
104                                               const uint8_t *dst,
105                                               int dst_stride,
106                                               unsigned int *sse) {
107   DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
108   DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
109 
110   var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
111                              17, 16,
112                              bilinear_filters[xoffset]);
113   var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
114                              16, bilinear_filters[yoffset]);
115   return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
116 }
117 
vpx_sub_pixel_variance32x32_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)118 unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
119                                               int src_stride,
120                                               int xoffset,
121                                               int yoffset,
122                                               const uint8_t *dst,
123                                               int dst_stride,
124                                               unsigned int *sse) {
125   DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
126   DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
127 
128   var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
129                              33, 32,
130                              bilinear_filters[xoffset]);
131   var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
132                              32, bilinear_filters[yoffset]);
133   return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
134 }
135 
vpx_sub_pixel_variance64x64_neon(const uint8_t * src,int src_stride,int xoffset,int yoffset,const uint8_t * dst,int dst_stride,unsigned int * sse)136 unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
137                                               int src_stride,
138                                               int xoffset,
139                                               int yoffset,
140                                               const uint8_t *dst,
141                                               int dst_stride,
142                                               unsigned int *sse) {
143   DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
144   DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
145 
146   var_filter_block2d_bil_w16(src, fdata3, src_stride, 1,
147                              65, 64,
148                              bilinear_filters[xoffset]);
149   var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
150                              64, bilinear_filters[yoffset]);
151   return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
152 }
153