1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 // NEON implementations of Image methods for compatible devices.  Control
17 // should never enter this compilation unit on incompatible devices.
18 
19 #ifdef __ARM_NEON
20 
21 #include <arm_neon.h>
22 
23 #include "tensorflow/examples/android/jni/object_tracking/geom.h"
24 #include "tensorflow/examples/android/jni/object_tracking/image-inl.h"
25 #include "tensorflow/examples/android/jni/object_tracking/image.h"
26 #include "tensorflow/examples/android/jni/object_tracking/utils.h"
27 
28 namespace tf_tracking {
29 
// Returns the sum of the four lanes of |values|.
//
// The lanes are stored to a stack-local buffer and added in lane order
// (0, 1, 2, 3).  The buffer is deliberately an automatic variable: a
// function-local static would be a single buffer shared by every caller,
// so concurrent calls from different threads would race on the same four
// floats and could return a corrupted sum.
inline static float GetSum(const float32x4_t& values) {
  float32_t lanes[4];
  vst1q_f32(lanes, values);
  return lanes[0]
       + lanes[1]
       + lanes[2]
       + lanes[3];
}
38 
39 
// Returns the arithmetic mean of the first |num_vals| floats in |values|.
//
// Complete groups of four floats are accumulated lane-wise with NEON, the
// four lanes are then collapsed with GetSum(), and any elements left over
// when |num_vals| is not a multiple of four are added one at a time.
// Callers are expected to pass at least 8 values; this is checked with
// SCHECK.  When SANITY_CHECKS is defined the result is additionally compared
// against ComputeMeanCpu().
float ComputeMeanNeon(const float* const values, const int num_vals) {
  SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);

  const float32_t* const neon_input =
      reinterpret_cast<const float32_t*>(values);

  // Vectorized pass: one 4-wide add per complete group of four inputs.
  float32x4_t lane_sums = vdupq_n_f32(0.0f);
  const int last_vector_start = num_vals - 4;
  int index = 0;
  while (index <= last_vector_start) {
    const float32x4_t chunk = vld1q_f32(neon_input + index);
    lane_sums = vaddq_f32(lane_sums, chunk);
    index += 4;
  }

  // Collapse the per-lane partial sums into one scalar.
  float total = GetSum(lane_sums);

  // Scalar pass over the 0 to 3 trailing values that did not fill a vector.
  while (index < num_vals) {
    total += values[index];
    ++index;
  }

  const float neon_mean = total / static_cast<float>(num_vals);

#ifdef SANITY_CHECKS
  const float cpu_mean = ComputeMeanCpu(values, num_vals);
  SCHECK(NearlyEqual(neon_mean, cpu_mean, EPSILON * num_vals),
        "Neon mismatch with CPU mean! %.10f vs %.10f",
        neon_mean, cpu_mean);
#endif

  return neon_mean;
}
70 
71 
// Returns the standard deviation of the first |num_vals| floats in |values|
// about the supplied |mean|, i.e. sqrt(sum((v - mean)^2) / num_vals).
//
// Complete groups of four floats are processed with NEON; elements left over
// when |num_vals| is not a multiple of four are handled in scalar code.
// Callers are expected to pass at least 8 values; this is checked with
// SCHECK.  When SANITY_CHECKS is defined the result is additionally compared
// against ComputeStdDevCpu().
float ComputeStdDevNeon(const float* const values,
                        const int num_vals, const float mean) {
  SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);

  const float32_t* const neon_input =
      reinterpret_cast<const float32_t*>(values);

  // Broadcasting -mean lets the per-element subtraction be done as a
  // vector add.
  const float32x4_t negated_mean = vdupq_n_f32(-mean);

  // Vectorized pass: accumulate squared deviations lane by lane.
  float32x4_t lane_sums = vdupq_n_f32(0.0f);
  const int last_vector_start = num_vals - 4;
  int index = 0;
  while (index <= last_vector_start) {
    const float32x4_t chunk = vld1q_f32(neon_input + index);
    const float32x4_t diffs = vaddq_f32(negated_mean, chunk);
    lane_sums = vmlaq_f32(lane_sums, diffs, diffs);
    index += 4;
  }

  // Collapse the per-lane partial sums into one scalar.
  float sum_of_squares = GetSum(lane_sums);

  // Scalar pass over the 0 to 3 trailing values that did not fill a vector.
  while (index < num_vals) {
    sum_of_squares += Square(values[index] - mean);
    ++index;
  }

  const float variance = sum_of_squares / static_cast<float>(num_vals);
  const float neon_std_dev = sqrt(variance);

#ifdef SANITY_CHECKS
  const float cpu_std_dev = ComputeStdDevCpu(values, num_vals, mean);
  SCHECK(NearlyEqual(neon_std_dev, cpu_std_dev, EPSILON * num_vals),
        "Neon mismatch with CPU std dev! %.10f vs %.10f",
        neon_std_dev, cpu_std_dev);
#endif

  return neon_std_dev;
}
108 
109 
// Returns the mean of the element-wise products of |values1| and |values2|
// over their first |num_vals| entries, i.e. sum(v1[i] * v2[i]) / num_vals.
//
// Complete groups of four floats are multiplied and accumulated with NEON;
// elements left over when |num_vals| is not a multiple of four are handled
// in scalar code.  Callers are expected to pass at least 8 values; this is
// checked with SCHECK.  When SANITY_CHECKS is defined the result is
// additionally compared against ComputeCrossCorrelationCpu().
float ComputeCrossCorrelationNeon(const float* const values1,
                                  const float* const values2,
                                  const int num_vals) {
  SCHECK(num_vals >= 8, "Not enough values to merit NEON: %d", num_vals);

  const float32_t* const neon_input1 =
      reinterpret_cast<const float32_t*>(values1);
  const float32_t* const neon_input2 =
      reinterpret_cast<const float32_t*>(values2);

  // Vectorized pass: multiply-accumulate one group of four pairs at a time.
  float32x4_t lane_sums = vdupq_n_f32(0.0f);
  const int last_vector_start = num_vals - 4;
  int index = 0;
  while (index <= last_vector_start) {
    const float32x4_t chunk1 = vld1q_f32(neon_input1 + index);
    const float32x4_t chunk2 = vld1q_f32(neon_input2 + index);
    lane_sums = vmlaq_f32(lane_sums, chunk1, chunk2);
    index += 4;
  }

  // Collapse the per-lane partial sums into one scalar.
  float product_sum = GetSum(lane_sums);

  // Scalar pass over the 0 to 3 trailing pairs that did not fill a vector.
  while (index < num_vals) {
    product_sum += values1[index] * values2[index];
    ++index;
  }

  const float neon_cross_correlation =
      product_sum / static_cast<float>(num_vals);

#ifdef SANITY_CHECKS
  const float cpu_cross_correlation =
      ComputeCrossCorrelationCpu(values1, values2, num_vals);
  SCHECK(NearlyEqual(neon_cross_correlation, cpu_cross_correlation,
                    EPSILON * num_vals),
        "Neon mismatch with CPU cross correlation! %.10f vs %.10f",
        neon_cross_correlation, cpu_cross_correlation);
#endif

  return neon_cross_correlation;
}
148 
149 }  // namespace tf_tracking
150 
151 #endif  // __ARM_NEON
152