1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // NEON implementations of Image methods for compatible devices.  Control
17 // should never enter this compilation unit on incompatible devices.
18 
19 #ifdef __ARM_NEON
20 
21 #include <arm_neon.h>
22 #include <stdint.h>
23 
24 #include "tensorflow/tools/android/test/jni/object_tracking/image-inl.h"
25 #include "tensorflow/tools/android/test/jni/object_tracking/image.h"
26 #include "tensorflow/tools/android/test/jni/object_tracking/image_utils.h"
27 #include "tensorflow/tools/android/test/jni/object_tracking/utils.h"
28 
29 namespace tf_tracking {
30 
31 // This function does the bulk of the work.
32 template <>
Downsample2x32ColumnsNeon(const uint8_t * const original,const int stride,const int orig_x)33 void Image<uint8_t>::Downsample2x32ColumnsNeon(const uint8_t* const original,
34                                                const int stride,
35                                                const int orig_x) {
36   // Divide input x offset by 2 to find output offset.
37   const int new_x = orig_x >> 1;
38 
39   // Initial offset into top row.
40   const uint8_t* offset = original + orig_x;
41 
42   // This points to the leftmost pixel of our 8 horizontally arranged
43   // pixels in the destination data.
44   uint8_t* ptr_dst = (*this)[0] + new_x;
45 
46   // Sum along vertical columns.
47   // Process 32x2 input pixels and 16x1 output pixels per iteration.
48   for (int new_y = 0; new_y < height_; ++new_y) {
49     uint16x8_t accum1 = vdupq_n_u16(0);
50     uint16x8_t accum2 = vdupq_n_u16(0);
51 
52     // Go top to bottom across the four rows of input pixels that make up
53     // this output row.
54     for (int row_num = 0; row_num < 2; ++row_num) {
55       // First 16 bytes.
56       {
57         // Load 16 bytes of data from current offset.
58         const uint8x16_t curr_data1 = vld1q_u8(offset);
59 
60         // Pairwise add and accumulate into accum vectors (16 bit to account
61         // for values above 255).
62         accum1 = vpadalq_u8(accum1, curr_data1);
63       }
64 
65       // Second 16 bytes.
66       {
67         // Load 16 bytes of data from current offset.
68         const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
69 
70         // Pairwise add and accumulate into accum vectors (16 bit to account
71         // for values above 255).
72         accum2 = vpadalq_u8(accum2, curr_data2);
73       }
74 
75       // Move offset down one row.
76       offset += stride;
77     }
78 
79     // Divide by 4 (number of input pixels per output
80     // pixel) and narrow data from 16 bits per pixel to 8 bpp.
81     const uint8x8_t tmp_pix1 = vqshrn_n_u16(accum1, 2);
82     const uint8x8_t tmp_pix2 = vqshrn_n_u16(accum2, 2);
83 
84     // Concatenate 8x1 pixel strips into 16x1 pixel strip.
85     const uint8x16_t allpixels = vcombine_u8(tmp_pix1, tmp_pix2);
86 
87     // Copy all pixels from composite 16x1 vector into output strip.
88     vst1q_u8(ptr_dst, allpixels);
89 
90     ptr_dst += stride_;
91   }
92 }
93 
94 // This function does the bulk of the work.
95 template <>
Downsample4x32ColumnsNeon(const uint8_t * const original,const int stride,const int orig_x)96 void Image<uint8_t>::Downsample4x32ColumnsNeon(const uint8_t* const original,
97                                                const int stride,
98                                                const int orig_x) {
99   // Divide input x offset by 4 to find output offset.
100   const int new_x = orig_x >> 2;
101 
102   // Initial offset into top row.
103   const uint8_t* offset = original + orig_x;
104 
105   // This points to the leftmost pixel of our 8 horizontally arranged
106   // pixels in the destination data.
107   uint8_t* ptr_dst = (*this)[0] + new_x;
108 
109   // Sum along vertical columns.
110   // Process 32x4 input pixels and 8x1 output pixels per iteration.
111   for (int new_y = 0; new_y < height_; ++new_y) {
112     uint16x8_t accum1 = vdupq_n_u16(0);
113     uint16x8_t accum2 = vdupq_n_u16(0);
114 
115     // Go top to bottom across the four rows of input pixels that make up
116     // this output row.
117     for (int row_num = 0; row_num < 4; ++row_num) {
118       // First 16 bytes.
119       {
120         // Load 16 bytes of data from current offset.
121         const uint8x16_t curr_data1 = vld1q_u8(offset);
122 
123         // Pairwise add and accumulate into accum vectors (16 bit to account
124         // for values above 255).
125         accum1 = vpadalq_u8(accum1, curr_data1);
126       }
127 
128       // Second 16 bytes.
129       {
130         // Load 16 bytes of data from current offset.
131         const uint8x16_t curr_data2 = vld1q_u8(offset + 16);
132 
133         // Pairwise add and accumulate into accum vectors (16 bit to account
134         // for values above 255).
135         accum2 = vpadalq_u8(accum2, curr_data2);
136       }
137 
138       // Move offset down one row.
139       offset += stride;
140     }
141 
142     // Add and widen, then divide by 16 (number of input pixels per output
143     // pixel) and narrow data from 32 bits per pixel to 16 bpp.
144     const uint16x4_t tmp_pix1 = vqshrn_n_u32(vpaddlq_u16(accum1), 4);
145     const uint16x4_t tmp_pix2 = vqshrn_n_u32(vpaddlq_u16(accum2), 4);
146 
147     // Combine 4x1 pixel strips into 8x1 pixel strip and narrow from
148     // 16 bits to 8 bits per pixel.
149     const uint8x8_t allpixels = vmovn_u16(vcombine_u16(tmp_pix1, tmp_pix2));
150 
151     // Copy all pixels from composite 8x1 vector into output strip.
152     vst1_u8(ptr_dst, allpixels);
153 
154     ptr_dst += stride_;
155   }
156 }
157 
158 
159 // Hardware accelerated downsampling method for supported devices.
160 // Requires that image size be a multiple of 16 pixels in each dimension,
161 // and that downsampling be by a factor of 2 or 4.
162 template <>
DownsampleAveragedNeon(const uint8_t * const original,const int stride,const int factor)163 void Image<uint8_t>::DownsampleAveragedNeon(const uint8_t* const original,
164                                             const int stride,
165                                             const int factor) {
166   // TODO(andrewharp): stride is a bad approximation for the src image's width.
167   // Better to pass that in directly.
168   SCHECK(width_ * factor <= stride, "Uh oh!");
169   const int last_starting_index = width_ * factor - 32;
170 
171   // We process 32 input pixels lengthwise at a time.
172   // The output per pass of this loop is an 8 wide by downsampled height tall
173   // pixel strip.
174   int orig_x = 0;
175   for (; orig_x <= last_starting_index; orig_x += 32) {
176     if (factor == 2) {
177       Downsample2x32ColumnsNeon(original, stride, orig_x);
178     } else {
179       Downsample4x32ColumnsNeon(original, stride, orig_x);
180     }
181   }
182 
183   // If a last pass is required, push it to the left enough so that it never
184   // goes out of bounds. This will result in some extra computation on devices
185   // whose frame widths are multiples of 16 and not 32.
186   if (orig_x < last_starting_index + 32) {
187     if (factor == 2) {
188       Downsample2x32ColumnsNeon(original, stride, last_starting_index);
189     } else {
190       Downsample4x32ColumnsNeon(original, stride, last_starting_index);
191     }
192   }
193 }
194 
195 
196 // Puts the image gradient matrix about a pixel into the 2x2 float array G.
197 // vals_x should be an array of the window x gradient values, whose indices
198 // can be in any order but are parallel to the vals_y entries.
199 // See http://robots.stanford.edu/cs223b04/algo_tracking.pdf for more details.
CalculateGNeon(const float * const vals_x,const float * const vals_y,const int num_vals,float * const G)200 void CalculateGNeon(const float* const vals_x, const float* const vals_y,
201                     const int num_vals, float* const G) {
202   const float32_t* const arm_vals_x = (const float32_t*) vals_x;
203   const float32_t* const arm_vals_y = (const float32_t*) vals_y;
204 
205   // Running sums.
206   float32x4_t xx = vdupq_n_f32(0.0f);
207   float32x4_t xy = vdupq_n_f32(0.0f);
208   float32x4_t yy = vdupq_n_f32(0.0f);
209 
210   // Maximum index we can load 4 consecutive values from.
211   // e.g. if there are 81 values, our last full pass can be from index 77:
212   // 81-4=>77 (77, 78, 79, 80)
213   const int max_i = num_vals - 4;
214 
215   // Defined here because we want to keep track of how many values were
216   // processed by NEON, so that we can finish off the remainder the normal
217   // way.
218   int i = 0;
219 
220   // Process values 4 at a time, accumulating the sums of
221   // the pixel-wise x*x, x*y, and y*y values.
222   for (; i <= max_i; i += 4) {
223     // Load xs
224     float32x4_t x = vld1q_f32(arm_vals_x + i);
225 
226     // Multiply x*x and accumulate.
227     xx = vmlaq_f32(xx, x, x);
228 
229     // Load ys
230     float32x4_t y = vld1q_f32(arm_vals_y + i);
231 
232     // Multiply x*y and accumulate.
233     xy = vmlaq_f32(xy, x, y);
234 
235     // Multiply y*y and accumulate.
236     yy = vmlaq_f32(yy, y, y);
237   }
238 
239   static float32_t xx_vals[4];
240   static float32_t xy_vals[4];
241   static float32_t yy_vals[4];
242 
243   vst1q_f32(xx_vals, xx);
244   vst1q_f32(xy_vals, xy);
245   vst1q_f32(yy_vals, yy);
246 
247   // Accumulated values are store in sets of 4, we have to manually add
248   // the last bits together.
249   for (int j = 0; j < 4; ++j) {
250     G[0] += xx_vals[j];
251     G[1] += xy_vals[j];
252     G[3] += yy_vals[j];
253   }
254 
255   // Finishes off last few values (< 4) from above.
256   for (; i < num_vals; ++i) {
257     G[0] += Square(vals_x[i]);
258     G[1] += vals_x[i] * vals_y[i];
259     G[3] += Square(vals_y[i]);
260   }
261 
262   // The matrix is symmetric, so this is a given.
263   G[2] = G[1];
264 }
265 
266 }  // namespace tf_tracking
267 
268 #endif
269